Introduction
XML is an essential data format when you construct an object model. Without
it, you can hardly represent anything hierarchical. MSXML, however, is a very
big DLL, and its version keeps changing, which makes installation awkward.
On the other hand, I only need to parse a small document in a small
application, yet I would have to distribute a big DLL with it. These reasons
made me want to reinvent the wheel.
After I finished my draft, I found that there are already some successful
XML-parsing projects, such as TinyXml, the simple STL-based XML parser by David
Hubbard, and XMLite: simple XML parser by Cho, Kyung-min. Each of them has
its own features, but my requirements are different from theirs: my code
snippet does not depend on MFC or STL, and it does not need to run on
other operating systems.
Using the code
#pragma once
#define MAX_TEXT_LEN 0x400
#define MAX_NAME_LEN 0x100
// One element of the parsed XML tree.
//
// A node owns its children: the destructor deletes every pointer stored in
// childNodes, so deleting the root releases the whole tree. Because of that
// ownership, copying is disabled (declared private and left undefined); a
// member-wise copy would make two nodes delete the same children.
class CAxNode
{
public:
    // Creates a detached node (no parent).
    CAxNode() : parent(NULL) {}

    // Creates a node that records p as its parent. The node is NOT added to
    // p->childNodes; the caller does that.
    CAxNode(CAxNode* p) : parent(p) {}

    // Deletes all child nodes, which in turn delete their own children.
    ~CAxNode()
    {
        for (int i = 0; i < childNodes.GetSize(); i++)
            delete childNodes[i];
    }

    CComBSTR elementType;                   // tag name of this element
    CSimpleArray<CComBSTR> arrText;         // text runs found between child tags, in order
    CSimpleMap<CComBSTR,CComBSTR> attrMap;  // attribute name -> attribute value
    CSimpleArray<CAxNode*> childNodes;      // owned child elements, in document order
    CAxNode* parent;                        // non-owning back pointer; NULL for a root

private:
    // Not implemented: a node owns raw child pointers and must not be copied.
    CAxNode(const CAxNode&);
    CAxNode& operator=(const CAxNode&);
};
// Reads one name token starting at s into x and advances s.
//
// The character s points at on entry is always taken as the first character
// of the token (unless it is the string terminator). Scanning then continues
// until the NEXT character is one of: CR, LF, TAB, space, '>', '=', '<', '/',
// or the terminator. On return s points at that stopping character; it is
// never moved past the terminating NUL.
//
// Tokens longer than MAX_NAME_LEN-1 characters are truncated in x, but s is
// still advanced to the stopping character so parsing can continue.
inline void FillSTR(LPTSTR& s,CComBSTR &x)
{
    TCHAR name[MAX_NAME_LEN];
    long j = 0;
    while (*s != _T('\0'))
    {
        // Keep the character only while there is room for it and the final NUL.
        if (j < MAX_NAME_LEN - 1)
            name[j++] = *s;
        ++s;
        if (*s==_T('\r')||*s==_T('\n')||
            *s==_T('\t')||*s==_T(' ')||*s==_T('>')||
            *s==_T('=')||*s==_T('<')||*s==_T('/'))
            break;
    }
    name[j] = _T('\0');
    x = name;
}
inline void SkipFormatChar(LPCTSTR& s)
{
while(*s++)
if(*s!=_T('\r')&&*s!=_T('\n')&&*s!=_T('\t')&&*s!=_T(' '))
break;
}
inline void ParseNode(CAxNode* Node, LPTSTR& s)
{
while(*s++!=_T('<'));
FillSTR(s,Node->elementType);
TCHAR szText[MAX_TEXT_LEN];
if(*s!=_T('>'))
{
SkipFormatChar(s);
while(*s)
{
CComBSTR attr;
FillSTR(s,attr);
while(*s++!=_T('"'));
long j = 0;
while(*s!=_T('"')&&j<MAX_TEXT_LEN)
szText[j++] = *s++;
szText[j] = _T('\0');
if(Node->attrMap.FindKey(attr)<0)
Node->attrMap.Add(attr,szText);
SkipFormatChar(s);
if(*s==_T('/'))
{
while(*s++!='>');
return;
}
if(*s==_T('>'))
break;
}
}
while(*s)
{
if(*s==_T('>'))
*s++;
long j = 0;
while(*s!=_T('<')&&j<MAX_TEXT_LEN)
szText[j++] = *s++;
szText[j] = _T('\0');
Node->arrText.Add(szText);
SkipFormatChar(s);
if(*s==_T('/'))
{
SkipFormatChar(s);
CComBSTR elementType;
FillSTR(s,elementType);
ATLASSERT(Node->elementType==elementType);
return;
}
*s--;
CAxNode* child = new CAxNode(Node);
Node->childNodes.Add(child);
ParseNode(child,s);
}
return;
}
#pragma warning(push)
#pragma warning(disable: 4244)
// Blanks out, in place, every region of s that starts with szleft and ends
// with the next szright after it (delimiters included), by overwriting the
// region with spaces. The buffer length is unchanged.
//
// s       - NUL-terminated, writable buffer; the pointer itself is not modified.
// szleft  - opening delimiter, e.g. "<!--".
// szright - closing delimiter, e.g. "-->".
//
// The closing delimiter is searched for only after the opening one, so the
// two never overlap and a stray szright earlier in the text is ignored.
// An opening delimiter with no closing delimiter after it is left untouched
// and ends the scan. NULL or empty arguments make the call a no-op.
inline void RemoveBlock(LPTSTR &s,LPCTSTR szleft,LPCTSTR szright)
{
    if (s == NULL || szleft == NULL || szright == NULL)
        return;
    const int leftLen = lstrlen(szleft);
    const int rightLen = lstrlen(szright);
    if (leftLen == 0 || rightLen == 0)
        return;

    LPTSTR cur = s;
    for (;;)
    {
        LPTSTR blockBegin = _tcsstr(cur,szleft);
        if (blockBegin == NULL)
            break;
        LPTSTR closing = _tcsstr(blockBegin + leftLen,szright);
        if (closing == NULL)
            break;
        LPTSTR blockEnd = closing + rightLen;
        for (LPTSTR p = blockBegin; p != blockEnd; ++p)
            *p = _T(' ');
        // Everything before blockEnd is already processed; resume from there.
        cur = blockEnd;
    }
    return;
}
#pragma warning(pop)
That is all of the code, 135 lines in total. To use it, you only need
atlsimpcoll.h. If you don't have it, you can copy it from another machine;
this header depends very little on other ATL headers.
// Open the XML file for reading; other processes may keep it open for
// reading or writing.
HANDLE hFile = ::CreateFile(testfile,GENERIC_READ,
    FILE_SHARE_READ|FILE_SHARE_WRITE,NULL,OPEN_EXISTING,0,NULL);
if (hFile == INVALID_HANDLE_VALUE)
    return -1;

ULARGE_INTEGER liFileSize;
liFileSize.LowPart = ::GetFileSize(hFile, &liFileSize.HighPart);
// INVALID_FILE_SIZE can also be a legitimate low DWORD, so GetLastError is
// consulted to tell the two apart. Files of 4 GB or more are rejected because
// ReadFile takes a DWORD byte count.
if ((liFileSize.LowPart == INVALID_FILE_SIZE && ::GetLastError() != NO_ERROR)
    || liFileSize.HighPart != 0)
{
    ::CloseHandle(hFile);
    return -1;
}

// Read the whole file into a buffer with room for a terminating NUL.
LPTSTR lpXML =
    new TCHAR[((size_t)liFileSize.LowPart)/sizeof(TCHAR)+1];
DWORD pdwRead = 0;
BOOL b = ::ReadFile(hFile, lpXML,
    liFileSize.LowPart,&pdwRead,NULL);
::CloseHandle(hFile);  // the handle is not needed once the data is in memory
if (!b)
{
    delete [] lpXML;
    return -1;
}
lpXML[pdwRead/sizeof(TCHAR)]= _T('\0');

// Blank out processing instructions and comments before parsing.
RemoveBlock(lpXML,_T("<?"),_T("?>"));
RemoveBlock(lpXML,_T("<!--"),_T("-->"));

CAxNode* pNode = new CAxNode(NULL);
LPTSTR cursor = lpXML;  // ParseNode advances this; lpXML must stay put for delete[]
DWORD dw1 = GetTickCount();
ParseNode(pNode,cursor);
DWORD dw2 = GetTickCount();  // dw2 - dw1 is the parse time in milliseconds

delete [] lpXML;
delete pNode;  // deletes the whole tree
History
If you read my previous version, you will see that I have changed a lot. For
example, LPTSTR has been replaced by CComBSTR, which let me drop simplearray
and equalhelper. A new constructor has also been added for my own use; you can
simply remove it. In the previous version, I added inner text only when its
length was greater than 0; that was not very reasonable, so I changed it.
The performance still needs a lot of work. If you don't need to change node
items at runtime, you can replace CComBSTR with LPCTSTR; this will use less
memory, and the performance will improve accordingly.
One last word, and obviously the most important statement you should take
note of: if you use my code in your commercial products, you certainly should
give part of your profit to me! I am serious, sir! Or I will call the police!
Anyway, comments are really appreciated, so that I can make it better for all
of us. I am waiting for you right here.