Introduction
An XML-like document sometimes cannot be loaded by an XML DOM parser without pre-processing, because it contains duplicate attribute-value pairs or because the author unexpectedly omitted some required spaces.
Background
I have a lot of XML documents that were composed and unloaded by clients, and they almost always contain some errors.
Using the code
Read the XML text into a string buffer, say strxml, and then call PreFormatXML(strxml). After that, you can create an instance of xmldom and call LoadXML with the cleaned-up string.
The following are the functions involved:
// Repairs a loosely written XML string in place so that an XML DOM parser can
// load it afterwards.
//
// Parameters:
//   strxml - [in/out] the raw document text; replaced by the cleaned-up text.
//
// Processing steps, in order:
//   1. Flatten whitespace: remove '\r', turn '\n' and '\t' into spaces.
//   2. Replace every <!-- ... --> comment with a single space.
//   3. Lower-case every character outside double-quoted values, insert "\r\n"
//      after each unquoted '>', and insert a space between a closing quote and
//      a directly following lower-case letter (e.g. b="x"c="y").
//   4. Rewrite each tag: map its name through m_ArrPreDefTag and rebuild the
//      attribute list of opening tags with RemoveDuplicate().
//
// Depends on m_ArrPreDefTag / preTag, which are declared outside this block.
// NOTE(review): preTag is assumed to expose string-like members tag, oritag
// and preattrs (match name, replacement name, attributes to prepend) - confirm
// against its declaration.
void PreFormatXML(CString& strxml)
{
    // ---- 1. Flatten whitespace ------------------------------------------
    // After this step the text contains no '\r' or '\n', so the "\r\n" pairs
    // inserted in step 3 are unambiguous tag-end markers.
    strxml.Remove(_T('\r'));
    strxml.Replace(_T('\n'), _T(' '));
    strxml.Replace(_T('\t'), _T(' '));

    // ---- 2. Strip comments ----------------------------------------------
    int iStart = 0;
    for (;;)
    {
        const int iOpen = strxml.Find(_T("<!--"), iStart);
        if (iOpen < 0)
            break;
        const int iClose = strxml.Find(_T("-->"), iOpen);
        if (iClose < 0)
            break;                      // unterminated comment: leave it alone
        // Replace the whole comment, including both delimiters, by one space.
        strxml = strxml.Left(iOpen) + _T(" ") + strxml.Mid(iClose + 3);
        iStart = iOpen + 1;
    }

    // ---- 3a. Build a mask where every quoted value is blanked with '*' ---
    // The mask has the same length as strxml, so indices line up 1:1.
    CString strmask = strxml;
    iStart = 0;
    for (;;)
    {
        const int iQuote1 = strxml.Find(_T('\"'), iStart);
        if (iQuote1 < 0)
            break;
        const int iQuote2 = strxml.Find(_T('\"'), iQuote1 + 1);
        if (iQuote2 < 0)
            break;                      // odd number of quotes: stop masking
        for (int i = iQuote1; i <= iQuote2; i++)
            strmask.SetAt(i, _T('*'));
        iStart = iQuote2 + 1;
    }
    strmask.MakeLower();

    // ---- 3b. Copy the lower-cased, unmasked characters back --------------
    // i1 walks the mask; i2 is the matching position in strxml, which runs
    // ahead of i1 by the number of characters inserted so far.
    const int nmask = strmask.GetLength();
    int i2 = 0;
    for (int i1 = 0; i1 < nmask; i1++, i2++)
    {
        const TCHAR ch1 = strmask.GetAt(i1);
        if (ch1 != _T('*'))
            strxml.SetAt(i2, ch1);      // quoted text keeps its original case

        if (ch1 == _T('>'))
        {
            // Mark the end of a tag; '>' inside quotes is masked and skipped.
            strxml.Insert(i2 + 1, _T("\r\n"));
            i2 += 2;
        }
        else if (ch1 == _T('*') && i1 < nmask - 1)
        {
            // A masked character directly followed by a letter means an
            // attribute name is glued to the previous value: separate them.
            const TCHAR ch2 = strmask.GetAt(i1 + 1);
            if (ch2 >= _T('a') && ch2 <= _T('z'))
            {
                strxml.Insert(i2 + 1, _T(' '));
                i2++;
            }
        }
    }
    strxml += _T("\r\n");

    // ---- 4. Rewrite every tag --------------------------------------------
    const int size = m_ArrPreDefTag.GetSize();
    preTag pa;
    CString strfake, strleft, strright, strnodename, strtag;
    iStart = 0;
    for (;;)
    {
        const int nlen = strxml.GetLength();
        const int iOpen = strxml.Find(_T('<'), iStart);
        if (iOpen < 0)
            break;
        int iClose = strxml.Find(_T(">\r\n"), iOpen);
        if (iClose < 0)
            break;

        strleft  = strxml.Left(iOpen + 1);          // everything up to and including '<'
        strright = strxml.Right(nlen - iClose);     // from the closing '>' to the end
        strfake  = strxml.Mid(iOpen + 1, iClose - iOpen - 1);
        strfake.TrimLeft();
        strfake.TrimRight();

        const int lensub = strfake.GetLength();
        if (lensub > 0)
        {
            // Trailing '/' => self-closed element (<tag ... />).
            const bool bselfClosed = _T('/') == strfake.GetAt(lensub - 1);
            if (bselfClosed)
            {
                strfake = strfake.Left(lensub - 1);
                strfake.TrimLeft();
                strfake.TrimRight();
            }

            // Leading '/' => closing tag (</tag>). Guard against the tag body
            // having become empty after the '/' above was stripped.
            const bool breversetag = !strfake.IsEmpty() && _T('/') == strfake.GetAt(0);
            if (breversetag)
            {
                strfake = strfake.Mid(1);           // drop the leading '/'
                strfake.TrimLeft();
                // Keep only the name; anything after the first space is dropped.
                const int n1 = strfake.Find(_T(' '));
                if (n1 > 0)
                    strfake = strfake.Left(n1);
                for (int isize = 0; isize < size; isize++)
                {
                    pa = m_ArrPreDefTag[isize];
                    if (strfake.CompareNoCase(pa.tag) == 0)
                    {
                        strfake = pa.oritag;
                        break;
                    }
                }
                strfake.MakeLower();
            }
            else
            {
                const int n0 = strfake.GetLength();
                const int n1 = strfake.Find(_T(' '));
                // NOTE(review): opening tags without any attribute (no space)
                // are not mapped through m_ArrPreDefTag, while closing tags
                // always are - confirm this asymmetry is intended.
                if (n1 > 0)
                {
                    strnodename = strfake.Left(n1);
                    strnodename.MakeLower();
                    for (int isize = 0; isize < size; isize++)
                    {
                        pa = m_ArrPreDefTag[isize];
                        if (strnodename.CompareNoCase(pa.tag) == 0)
                        {
                            // Predefined attributes go first, ahead of the
                            // attributes written in the document.
                            strnodename = pa.oritag + _T(" ") + pa.preattrs;
                            break;
                        }
                    }
                    ATLASSERT(n0 > n1);
                    // Right(n0-n1) starts at the space, so name and attributes
                    // stay separated.
                    strfake = strnodename + strfake.Right(n0 - n1);
                    RemoveDuplicate(strfake);
                }
            }

            // Re-attach the slash that was stripped above. When both slashes
            // were present neither is emitted (same as the original logic).
            strtag.Empty();
            if (breversetag && !bselfClosed)
                strtag = _T("/");
            strtag += strfake;
            if (bselfClosed && !breversetag)
                strtag += _T("/");

            strxml = strleft + strtag + strright;
            // Re-base the end index on the rewritten tag so that the next
            // search resumes just behind its '>'.
            iClose = strtag.GetLength() + iOpen;
        }
        else
            ATLASSERT(0);               // "<>" - empty tag, nothing to rewrite

        iStart = iClose + 3;
    }
    return;
}
// Rebuilds the text of an opening tag (without the angle brackets) so that
// every attribute appears only once.
//
// Parameters:
//   str - [in/out] tag text of the form: name attr1="v1" attr2="v2" ...
//
// Behaviour:
//   - str is trimmed; if it contains no space (no attributes) nothing else
//     is changed.
//   - The node name (text before the first space) and all attribute names
//     are lower-cased; attribute values are trimmed.
//   - When an attribute name occurs more than once, the FIRST value is kept
//     and later ones are discarded.
//   - Parsing stops at the first unterminated quote; anything after the last
//     complete name="value" pair is dropped.
//   - The result is: name attr1="v1" attr2="v2" ... in first-seen order.
void RemoveDuplicate(CString& str)
{
    str.TrimLeft();
    str.TrimRight();

    const int iSpace = str.Find(_T(' '));
    if (iSpace <= 0)
        return;                         // bare tag name: nothing to de-duplicate

    CString strnodename = str.Left(iSpace);
    strnodename.MakeLower();

    // Everything behind the node name is the attribute list.
    CString strfake = str.Mid(iSpace);
    strfake.TrimLeft();
    strfake.TrimRight();

    CSimpleMap<CString, CString> attributes;
    CString strattr, strvalue;
    int m0 = 0;                         // start of the current attribute name
    for (;;)
    {
        const int m1 = strfake.Find(_T('\"'), m0);          // opening quote
        if (m1 < 0)
            break;
        const int m2 = strfake.Find(_T('\"'), m1 + 1);      // closing quote
        if (m2 < 0)
            break;                      // unterminated value: stop instead of looping forever

        // Name is the text in front of the opening quote, minus '=' and blanks.
        strattr = strfake.Mid(m0, m1 - m0);
        strattr.Remove(_T('='));
        strattr.MakeLower();
        strattr.TrimLeft();
        strattr.TrimRight();

        strvalue = strfake.Mid(m1 + 1, m2 - m1 - 1);
        strvalue.TrimLeft();
        strvalue.TrimRight();

        // First occurrence wins; later duplicates are ignored.
        if (attributes.FindKey(strattr) < 0)
            attributes.Add(strattr, strvalue);

        m0 = m2 + 1;
    }

    // Recompose the tag from the unique attributes.
    str = strnodename;
    const int size = attributes.GetSize();
    for (int i = 0; i < size; i++)
    {
        str += _T(" ");
        str += attributes.GetKeyAt(i);
        str += _T("=\"");
        str += attributes.GetValueAt(i);
        str += _T("\"");
    }
    attributes.RemoveAll();
    return;
}
As you can see above, all of the attribute-value pairs are recomposed using CSimpleMap. When a duplicate attribute is encountered, only its first occurrence is kept and the later ones are dropped. In addition, in strings such as <a b="blah"c="interesting value">, a space is inserted between "blah" and c; otherwise the document cannot be loaded successfully.
This code has been tested under Windows XP with VS.NET 2002, WTL 7.1, and ATL 7.0. Any comments are appreciated.