Click here to Skip to main content
65,938 articles
CodeProject is changing. Read more.
Articles
(untagged)

PreParse XML using CString

0.00/5 (No votes)
4 Mar 2004 1  
An article on XML parsing using CString.

Introduction

When you parse an XML-like document, the XML DOM may refuse to open it unless it is pre-parsed first, because it contains duplicate attribute-value pairs or because the author accidentally omitted some spaces.

Background

I have a lot of XML docs which were composed and uploaded by clients, but there are always some errors in them.

Using the code

Read the XML text into a string buffer, say strxml. Then just call PreFormatXML(strxml); after that, you can create an instance of the XML DOM and call LoadXML.

The following are the functions involved:

// Pre-formats a loosely written XML-like string in place so that it has a
// better chance of loading into an XML DOM (e.g. through LoadXML).
//
// Steps, in order:
//   1. Flatten the text to one line: '\r' is removed, '\n' and '\t' become
//      spaces.
//   2. Replace every <!-- ... --> comment with a single space.
//   3. Lower-case everything outside double-quoted values, insert "\r\n"
//      after every '>' that is outside quotes, and insert a space where a
//      closing quote is directly followed by a letter
//      (b="x"c="y" becomes b="x" c="y").
//   4. Rewrite every <...> tag. The element name is compared, ignoring case,
//      with the 'tag' field of each entry of m_ArrPreDefTag (declared outside
//      this function). A matching closing tag takes the entry's 'oritag'
//      (lower-cased); a matching opening tag that carries attributes takes
//      'oritag' plus 'preattrs'. Opening tags with attributes are then passed
//      through RemoveDuplicate().
//
// strxml  [in,out] the XML text; it is modified in place.
void PreFormatXML(CString& strxml)
{
    // Declared up front because the comment-stripping loop already needs
    // strtemp.
    CString strtemp,strfake,strleft,strright;

    // ---- 1. flatten to a single line (a tooltip may contain "\r\n") ----
    strxml.Remove(_T('\r'));
    strxml.Replace(_T('\n'),_T(' '));
    strxml.Replace(_T('\t'),_T(' '));

    // ---- 2. replace comments with a blank ----
    int iStart = 0;
    int iEnd1,iEnd2;
    while(true)
    {
        iEnd1 = strxml.Find(_T("<!--"),iStart);
        if(iEnd1<0)
            break;
        iEnd2 = strxml.Find(_T("-->"),iEnd1);
        if(iEnd2<0)
            break;      // unterminated comment: leave the rest untouched

        strtemp = strxml.Mid(iEnd1,iEnd2-iEnd1+3);
        strxml.Replace(strtemp,_T(" "));
        // Nothing before iEnd1 contains "<!--" and the blank written at
        // iEnd1 cannot create one, so the search can resume here instead of
        // restarting from the beginning.
        iStart = iEnd1;
    }

    // ---- 3a. build a mask copy in which quoted values are blanked out ----
    // A control character is used as the mask so that a literal '*' in the
    // document is not mistaken for a masked position.
    const TCHAR chMask = _T('\x01');
    CString strmask = strxml;
    iStart = 0;
    while(true)
    {
        iEnd1 = strxml.Find(_T('\"'),iStart);
        if(iEnd1<0)
            break;
        iEnd2 = strxml.Find(_T('\"'),iEnd1+1);
        if(iEnd2<0)
            break;
        for(int i=iEnd1;i<=iEnd2;i++)
            strmask.SetAt(i,chMask);
        iStart = iEnd2+1;
    }

    strmask.MakeLower();

    // ---- 3b. copy the lower-cased text back, split tags onto lines ----
    // i1 walks the mask (fixed length); i2 is the matching position in
    // strxml, which grows as line breaks and spaces are inserted.
    int i1 = 0;
    int i2 = 0;
    int nlen = strmask.GetLength();
    while(i1<nlen)
    {
        TCHAR ch1 = strmask.GetAt(i1);
        if(ch1!=chMask)
            strxml.SetAt(i2,ch1);       // quoted values keep their case

        if(ch1==_T('>'))
        {
            strxml.Insert(i2+1,_T("\r\n"));
            i2 += 2;
        }

        if(ch1==chMask && i1<nlen-1)
        {
            // Last masked character (the closing quote) directly followed
            // by a letter: two attributes were written without a separator.
            TCHAR ch2 = strmask.GetAt(i1+1);
            if(ch2>=_T('a')&&ch2<=_T('z'))
            {
                strxml.Insert(i2+1,_T(' '));
                i2++;
            }
        }
        i1++;
        i2++;
    }

    // Make sure the last '>' is also followed by "\r\n"; the tag loop below
    // searches for ">\r\n".
    strxml += _T("\r\n");

    // ---- 4. rewrite each tag ----
    int size = m_ArrPreDefTag.GetSize();
    preTag pa;
    CString strnodename;

    iStart = 0;
    while(true)
    {
        iEnd1 = strxml.Find(_T('<'),iStart);
        if(iEnd1<0)
            break;
        iEnd2 = strxml.Find(_T(">\r\n"),iEnd1);
        if(iEnd2<0)
            break;

        // Keep what lies left of the tag body (including '<') and right of
        // it (starting at '>').
        strleft = strxml.Left(iEnd1+1);
        strright= strxml.Mid(iEnd2);

        // Pick out the text between '<' and '>' and process it.
        strfake = strxml.Mid(iEnd1+1,iEnd2-iEnd1-1);
        strfake.TrimLeft();
        strfake.TrimRight();

        int lensub = strfake.GetLength();
        if(lensub>0)
        {
            bool bselfClosed = _T('/')==strfake.GetAt(lensub-1);
            if(bselfClosed)
                strfake = strfake.Left(lensub-1);

            strfake.TrimLeft();
            strfake.TrimRight();

            // strfake may be empty here (e.g. "</>"), so guard the GetAt.
            bool breversetag = !strfake.IsEmpty() && _T('/')==strfake.GetAt(0);
            if(breversetag)
            {
                // Drop the leading '/' and any blanks that follow it.
                strfake = strfake.Mid(1);
                strfake.TrimLeft();

                // A closing tag has no attr-value pairs: keep the name only.
                int n1 = strfake.Find(_T(' '));
                if(n1>0)
                    strfake = strfake.Left(n1);

                for(int isize=0;isize<size;isize++)
                {
                    pa = m_ArrPreDefTag[isize];
                    if(strfake.CompareNoCase(pa.tag)==0)
                    {
                        strfake = pa.oritag;
                        break;
                    }
                }
                strfake.MakeLower();
            }
            else
            {
                int n0 = strfake.GetLength();
                int n1 = strfake.Find(_T(' '));
                if(n1>0)    // name followed by attributes
                {
                    // Replace the name with the predefined tag, if any.
                    strnodename = strfake.Left(n1);
                    strnodename.MakeLower();
                    for(int isize=0;isize<size;isize++)
                    {
                        pa = m_ArrPreDefTag[isize];
                        if(strnodename.CompareNoCase(pa.tag)==0)
                        {
                            strnodename = pa.oritag + _T(" ") + pa.preattrs;
                            break;
                        }
                    }

                    // Re-attach the attribute text (it still starts with
                    // the separating blank) and drop duplicate attributes.
                    ATLASSERT(n0>n1);
                    strfake = strnodename+strfake.Mid(n1);
                    RemoveDuplicate(strfake);
                }
                // else: a bare name without attr-value pairs is left as is
            }

            // Put the slash back: in front for a closing tag, otherwise at
            // the end for a self-closed tag.
            strtemp = strfake;
            if(breversetag)
                strtemp.Insert(0,_T('/'));
            else if(bselfClosed)
                strtemp += _T("/");

            strxml = strleft + strtemp + strright;
            // New position of this tag's '>' in the rebuilt string.
            iEnd2 = iEnd1+1+strtemp.GetLength();
        }
        else
            ATLASSERT(0);        // an empty "<>" is not expected

        iStart = iEnd2+3;        // continue after ">\r\n"
    }
    return;
}

// Rebuilds the text of one tag, given as: name attr="value" attr="value" ...
// (without the surrounding '<' and '>'), so that every attribute name occurs
// only once.
//
// The string is trimmed first. If it contains no blank, i.e. there is no
// attribute part, nothing else is changed. Otherwise the element name and
// the attribute names are lower-cased, attribute values are trimmed, and
// the tag is re-composed as: name attr="value" ... in order of first
// appearance. When an attribute name is repeated, the first value is kept
// and later ones are dropped. Text that is not part of a double-quoted
// attr="value" pair is discarded.
//
// str  [in,out] the tag text; it is modified in place.
void RemoveDuplicate(CString& str)
{
    str.TrimLeft();
    str.TrimRight();

    int n1 = str.Find(_T(' '));
    if(n1<=0)
        return;             // bare name, no attr-value pairs

    CString strnodename = str.Left(n1);
    strnodename.MakeLower();

    // Everything after the name is the attribute text.
    CString strfake = str.Mid(n1);
    strfake.TrimLeft();
    strfake.TrimRight();

    CSimpleMap<CString,CString> attributes;
    CString strattr,strvalue;

    // m0: start of the current attribute name
    // m1: opening quote of its value, m2: closing quote
    int m0 = 0;
    while(true)
    {
        int m1 = strfake.Find(_T('\"'),m0);
        if(m1<0)
            break;

        int m2 = strfake.Find(_T('\"'),m1+1);
        if(m2<0)
            break;          // unterminated value: stop instead of looping

        // Name is everything up to the opening quote, minus '=' and blanks.
        strattr  = strfake.Mid(m0,m1-m0);
        strattr.Remove(_T('='));
        strattr.MakeLower();
        strattr.TrimLeft();
        strattr.TrimRight();

        strvalue = strfake.Mid(m1+1,m2-m1-1);
        strvalue.TrimLeft();
        strvalue.TrimRight();

        // First occurrence wins; later duplicates are ignored.
        if(attributes.FindKey(strattr)<0)
            attributes.Add(strattr,strvalue);
        m0 = m2+1;
    }

    // Re-compose the tag from the collected pairs.
    str = strnodename;
    int size = attributes.GetSize();
    for(int i=0;i<size;i++)
    {
        str += _T(" ");
        str += attributes.GetKeyAt(i);
        str += _T("=\"");
        str += attributes.GetValueAt(i);
        str += _T("\"");
    }
    return;
}

As you can see above, I have recomposed all of the attr-value pairs using CSimpleMap. When a duplicate attribute is encountered, only its first occurrence is kept and the later ones are dropped. Another thing: in strings such as <a b="blah"c="interesting value">, a space is added between "blah" and c; otherwise the document cannot be loaded successfully.

This code has been tested under XP, VS.NET 2002, WTL 71, ATL70. Any comment is appreciated.

License

This article has no explicit license attached to it but may contain usage terms in the article text or the download files themselves. If in doubt please contact the author via the discussion board below.

A list of licenses authors might use can be found here