Click here to Skip to main content
65,938 articles
CodeProject is changing. Read more.
Articles
(untagged)

The most minimized XML parser

0.00/5 (No votes)
11 May 2004 1  
An XML parser to substitute the big MSXML.

Introduction

XML is an essential data format when you construct an object model; without it, it is hard to represent anything hierarchical. MSXML, however, is a big DLL, and its version keeps changing, which makes installation harder than it should be. On the other hand, I only need to parse a small document in a small application, yet I would have to distribute a big DLL. These reasons made me want to reinvent the wheel.

After I finished my draft, I found that there are already some successful XML-parsing projects, such as TinyXml, the simple STL-based XML parser by David Hubbard, and XMLite: simple XML parser by Cho, Kyung-min. Each of them has its own features, but my requirements are somewhat different from theirs: my code snippet does not depend on MFC or STL, and it does not need to run on other operating systems.

Using the code

 
#pragma once
#define MAX_TEXT_LEN 0x400
#define MAX_NAME_LEN 0x100

// One element of the parsed XML tree, filled in by ParseNode().
//
// A node owns its children: destroying a node deletes every node stored in
// childNodes (and, through their destructors, the whole subtree). parent is
// a plain back pointer and is never deleted through.
class CAxNode
{
public:
  CAxNode():parent(NULL){}
  CAxNode(CAxNode* p):parent(p){}
  ~CAxNode()
  {
    // Children are allocated with new by ParseNode and released here.
    for(long i=0; i<childNodes.GetSize();i++)
      delete childNodes[i];
  }

  CComBSTR  elementType;                   // tag name
  CSimpleArray<CComBSTR> arrText;          // text runs found between child tags
  CSimpleMap<CComBSTR,CComBSTR> attrMap;   // attribute name -> attribute value
  CSimpleArray<CAxNode*> childNodes;       // owned child elements
  CAxNode*   parent;                       // enclosing element, NULL for the root

private:
  // Not copyable: a member-wise copy would share the child pointers and both
  // destructors would delete them. Declared but intentionally not defined.
  CAxNode(const CAxNode&);
  CAxNode& operator=(const CAxNode&);
};

// Copies the token starting at s into x and advances s to the character that
// ended it.
//
// The character s points at on entry is always taken as the first character
// of the token (unless it is the terminating NUL). The token then runs up to,
// but not including, the next CR, LF, TAB, space, '>', '=', '<' or '/', or
// the end of the input.
//
// At most MAX_NAME_LEN-1 characters are stored; the rest of an over-long
// token is still consumed so the caller's cursor stays in step with the
// input. s is never moved past the terminating NUL.
inline void FillSTR(LPTSTR& s,CComBSTR &x)
{
  TCHAR name[MAX_NAME_LEN];
  long j=0;

  while(*s)
  {
    // keep one slot free for the terminator
    if(j<MAX_NAME_LEN-1)
      name[j++]=*s;
    s++;

    // should be char or dig, and not dig head
    if(*s==_T('\r')||*s==_T('\n')||
      *s==_T('\t')||*s==_T(' ')||*s==_T('>')||
      *s==_T('=')||*s==_T('<')||*s==_T('/'))
      break;
  }

  name[j] =_T('\0');
  //CharLower(name);

  x = name;
}

inline void SkipFormatChar(LPCTSTR& s)
{
  while(*s++)
    if(*s!=_T('\r')&&*s!=_T('\n')&&*s!=_T('\t')&&*s!=_T(' '))
      break;
}
// Parses one element, starting at the next '<' found from s, into Node.
//
// Node receives:
//   - elementType: the tag name;
//   - attrMap:     the attributes (values must be in double quotes; when a
//                  name is repeated the first value is kept);
//   - arrText:     one entry for every run of text in front of a child tag
//                  or the closing tag, empty runs included;
//   - childNodes:  one newly allocated node per nested element, parsed
//                  recursively and owned by Node.
//
// Attribute values and text runs are stored through a MAX_TEXT_LEN buffer;
// anything longer is truncated to MAX_TEXT_LEN-1 characters, but the input
// is still consumed up to its real end.
//
// On return s points at the '>' of the element's closing tag, or just past
// the '>' of a self-closed tag ("<a/>"), or at the terminating NUL if the
// input ended before the element was complete. s is never moved past the
// terminating NUL by this function itself.
inline void ParseNode(CAxNode* Node, LPTSTR& s)
{
  // Find the opening '<' and step over it.
  while(*s&&*s!=_T('<'))
    s++;
  if(!*s)
    return;
  s++;
  if(!*s)
    return;
  FillSTR(s,Node->elementType);

  TCHAR szText[MAX_TEXT_LEN];

  // Whitespace between the tag name and whatever follows it.
  while(*s==_T('\r')||*s==_T('\n')||*s==_T('\t')||*s==_T(' '))
    s++;

  // now spawn attr map
  while(*s&&*s!=_T('>'))
  {
    if(*s==_T('/'))
    {
      // closeness of some self-closed tag: leave s just behind its '>'
      while(*s&&*s!=_T('>'))
        s++;
      if(*s)
        s++;
      return;
    }

    CComBSTR attr;
    FillSTR(s,attr);

    // Move to the first character of the quoted value.
    while(*s&&*s!=_T('"'))
      s++;
    if(!*s)
      return;
    s++;

    long j = 0;
    while(*s&&*s!=_T('"'))
    {
      if(j<MAX_TEXT_LEN-1)  // keep room for the terminator
        szText[j++] = *s;
      s++;
    }
    szText[j] = _T('\0');

    // remove duplicate attributes
    if(Node->attrMap.FindKey(attr)<0)
      Node->attrMap.Add(attr,szText);

    if(!*s)
      return;
    SkipFormatChar(s);  // closing quote and the whitespace behind it
  }

  // processing child nodes
  while(*s)
  {
    // '>' of this element's start tag or of the child that was just parsed
    if(*s==_T('>'))
      s++;

    long j = 0;
    while(*s&&*s!=_T('<'))
    {
      if(j<MAX_TEXT_LEN-1)  // keep room for the terminator
        szText[j++] = *s;
      s++;
    }
    szText[j] = _T('\0');
    Node->arrText.Add(szText);

    if(!*s)
      return;  // input ended without a closing tag

    SkipFormatChar(s);  // the '<'
    if(*s==_T('/'))
    {
      SkipFormatChar(s);  // the '/'
      CComBSTR elementType;
      FillSTR(s,elementType);
      ATLASSERT(Node->elementType==elementType);

      // Tolerate "</name >": callers expect s on the '>'.
      while(*s&&*s!=_T('>'))
        s++;
      return;
    }
    s--;  // back onto the '<' so the recursive call finds the child's tag

    CAxNode* child = new CAxNode(Node);
    Node->childNodes.Add(child);
    ParseNode(child,s);
  }
  return;
}

#pragma warning(push)
#pragma warning(disable: 4244) 
// Overwrites every szleft ... szright block in s with spaces, markers
// included, e.g. RemoveBlock(buf,_T("<!--"),_T("-->")) blanks all comments.
// The length of the text does not change.
//
// The closing marker is searched for behind the opening marker, so a stray
// closing marker earlier in the text is ignored. A block whose closing
// marker is missing is left untouched and ends the scan. Nothing is done
// when s is NULL or either marker is NULL or empty.
inline void RemoveBlock(LPTSTR &s,LPCTSTR szleft,LPCTSTR szright)
{
  if(s==NULL)
    return;

  // lstrlen returns 0 for a NULL string
  const int nLeft = lstrlen(szleft);
  const int nRight = lstrlen(szright);
  if(nLeft==0||nRight==0)
    return;

  LPTSTR cur = s;
  while(1)
  {
    LPTSTR s1 = _tcsstr(cur,szleft);
    if(s1==NULL)
      break;

    LPTSTR s2 = _tcsstr(s1+nLeft,szright);
    if(s2==NULL)
      break;  // unterminated block

    LPTSTR end = s2+nRight;
    for(LPTSTR p=s1;p<end;p++)
      *p = _T(' ');

    // Everything before end is already clean; continue from there.
    cur = end;
  }
  return;
}
#pragma warning(pop)

This is all of the code, 135 lines in total. To use it, you only need atlsimpcoll.h. If you don't have it, you can copy it from another machine; this file has little dependency on other ATL headers.

// Read the whole file into a NUL-terminated TCHAR buffer.
// NOTE: the bytes are handed to the parser unconverted, so the file's
// encoding has to match the TCHAR width of the build.
HANDLE hFile = ::CreateFile(testfile,GENERIC_READ,
  FILE_SHARE_WRITE,NULL,OPEN_EXISTING,NULL,NULL);
if (hFile == INVALID_HANDLE_VALUE)
  return -1;

ULARGE_INTEGER liFileSize;
liFileSize.LowPart = ::GetFileSize(hFile, &liFileSize.HighPart);
// 0xFFFFFFFF is also a valid low dword, so it only means failure when
// GetLastError() agrees. Files that do not fit a single ReadFile call
// (high dword set) are rejected as well.
if ((liFileSize.LowPart == 0xFFFFFFFF && ::GetLastError() != NO_ERROR) ||
  liFileSize.HighPart != 0)
{
  ::CloseHandle(hFile);
  return -1;
}

LPTSTR lpXML = 
 new TCHAR[((size_t)liFileSize.QuadPart)/sizeof(TCHAR)+1];

DWORD pdwRead = 0;
BOOL b = ::ReadFile(hFile, lpXML, 
  (DWORD)liFileSize.QuadPart,&pdwRead,NULL);
::CloseHandle(hFile);  // the file is not needed after this point
if (!b)
{
  delete [] lpXML;
  return -1;
}
lpXML[pdwRead/sizeof(TCHAR)]= _T('\0');
  
// Blank out processing instructions and comments before parsing.
RemoveBlock(lpXML,_T("<?"),_T("?>"));
RemoveBlock(lpXML,_T("<!--"),_T("-->"));

CAxNode* pNode = new CAxNode(NULL);
  
// ParseNode advances the pointer it is given, so parse through a copy and
// keep lpXML for delete [].
LPTSTR xxx = lpXML;
DWORD dw1 = GetTickCount();
ParseNode(pNode,xxx);
DWORD dw2 = GetTickCount();  

// The tree holds its own copies of all strings.
delete [] lpXML;
  
// remove child at 0,0

//CAxNode* child = pNode->childNodes[0]->childNodes[0];

//delete child;

//pNode->childNodes[0]->childNodes.RemoveAt(0);


// Deleting the root releases the whole tree.
delete pNode;
  

History

If you read the previous version, you will see that I have changed a lot. For example, LPTSTR has been replaced by CComBSTR, which let me drop simplearray and equalhelper. A new constructor has also been added for my personal use; you can simply remove it. In the previous version, I added inner text only when its length was greater than 0; that was not very reasonable, so I changed it.

The performance still needs a lot of work. If you don't need to change node items at run time, you can replace CComBSTR with LPCTSTR; this will use less memory, and the performance will improve accordingly.

One last word, and obviously the most important statement you should take note of: if you use my code in your commercial products, you certainly should give part of your profit to me! I am serious, sir! Or I will call the police!

Anyway, comments are really appreciated, so I can make it better for every one of us. I am waiting for you right here.

License

This article has no explicit license attached to it but may contain usage terms in the article text or the download files themselves. If in doubt please contact the author via the discussion board below.

A list of licenses authors might use can be found here