Click here to Skip to main content
65,938 articles
CodeProject is changing. Read more.
Articles / Languages / XML

Simple XML Parser - Minimalistic Approach

3.00/5 (6 votes)
26 Feb 2008CPOL2 min read 1  
XML parser helper classes in pure C++ without validation

Introduction

Parsing XML documents in a pure C++ program may require huge class libraries or connecting, via COM, to the MSXML parser. But once in a while, client programs are upgraded to use the XML format for the short messages they send and receive. This article summarizes a minimalistic approach to XML parsing. We leave all verification to the server and the transport layer in an attempt to provide a small and handy set of pure C++ classes for working with small and simple XML messages.

The Code

Below is the single XmlParser.h file you need in your project. It uses the Microsoft secure extensions to the C run-time library that were introduced with Visual Studio 2005, if available. We use the STL string class, too.

C++
#include <ctype.h>
#include <stdlib.h>
#include <string.h>

#include <string>

// Locates one double-quoted attribute (name="value") inside the opening tag
// of an XML element and converts its value on demand.
//
// The object only stores a pointer into the caller's buffer, so that buffer
// must outlive every conversion. No entity decoding or validation is done,
// and single-quoted attribute values are not recognised.
class XmlAttribute
{
  const char* pstr; // first character of the selected value; 0 when nothing is selected

  // Returns the end of the opening tag that starts at `element`: the first
  // '>' that is not inside a double-quoted value, or the terminating NUL.
  static const char* tagEnd(const char* element)
  {
    bool quoted = false;
    const char* p = element;
    for (; *p != '\0'; ++p)
    {
      if (*p == '"')
        quoted = !quoted;
      else if (*p == '>' && !quoted)
        break;
    }
    return p;
  }

  public:
    XmlAttribute() : pstr(0) {}

    // Selects the attribute `name` of the element whose text starts at
    // `element`. Returns true when found; otherwise the object is reset so
    // that every conversion yields its "empty" value.
    bool find(const char* element, const char* name)
    {
      pstr = 0;
      if (element == 0 || name == 0)
        return false;

      std::string attr = " ";
      attr += name;
      attr += "=\"";

      const char* hit = strstr(element, attr.c_str());
      // A hit behind the end of the opening tag belongs to the text or to a
      // child element, not to this element.
      if (hit == 0 || hit >= tagEnd(element))
        return false;

      pstr = hit + attr.size();
      return true;
    }

    // Numeric conversions; all of them return 0 when no attribute is selected.
    // (unsigned long is the type behind DWORD on Windows.)
    operator unsigned long() const { return pstr == 0 ? 0 : strtoul(pstr, NULL, 10); }
    operator int() const { return pstr == 0 ? 0 : atoi(pstr); }
    operator float() const { return pstr == 0 ? 0.0f : static_cast<float>(atof(pstr)); }

    // True when the value starts with "true" in any letter case.
    operator bool() const
    {
      if (pstr == 0)
        return false;

      static const char lowered[] = "true";
      for (size_t i = 0; i < 4; ++i)
      {
        // Stops at the terminating NUL as well, so nothing past it is read.
        if (tolower(static_cast<unsigned char>(pstr[i])) != lowered[i])
          return false;
      }
      return true;
    }

    // The value up to, but excluding, the closing quote (or up to the end of
    // the buffer when the closing quote is missing).
    operator std::string() const
    {
      if (pstr == 0)
        return std::string();
      return std::string(pstr, strcspn(pstr, "\""));
    }

    // Copies the value into a fixed-size char array. The array is always
    // NUL-terminated; values that do not fit are truncated.
    template<size_t size> friend 	// cannot overload the normal assignment
				// operator the way we need!
      const char* operator&=(char (&dest)[size], XmlAttribute attr)
    {
      memset(dest, 0, size);
      if (attr.pstr != 0)
      {
        const size_t length = strcspn(attr.pstr, "\"");
        // Leave room for the terminator that memset() already put in place.
        const size_t count = length < size - 1 ? length : size - 1;
#if (_MSC_VER >= 1400)
        strncpy_s(dest, size, attr.pstr, count);
#else
        strncpy(dest, attr.pstr, count);
#endif
      }
      return &(dest[0]);
    }
};

// A non-owning view of an XML element that starts at a '<' inside a
// NUL-terminated buffer. The buffer must outlive the XmlElement.
class XmlElement
{
  const char* content;      // start of the element text; may be 0
  XmlAttribute anAttribute; // is reused by every GetAttribute() call

  public:
    // The tag name of the element; empty when the text does not start with '<'.
    class namestring: private std::string
    {
    public:
      namestring(const char* str)
      {
        if (str != 0 && str[0] == '<')
        {
          // The name ends at the first blank, line break, '>', '<' or '/'.
          assign(str+1, strcspn(str+1, " ></\t\r\n"));
        }
      }
      operator const char*() const { return c_str(); }
      bool operator==(const char* str) const { return 0 == strcmp(c_str(), str); }
      bool operator!=(const char* str) const { return 0 != strcmp(c_str(), str); }
    };

    const namestring Name;

    // `str` should point at the '<' that opens the element. A null pointer
    // yields an element with an empty Name, no attributes and no children.
    XmlElement(const char* str) : content(str), Name(str) {}

    // Looks up attribute `name`. The returned reference designates a single
    // member object that the next GetAttribute() call overwrites, so convert
    // the value before asking for another attribute.
    const XmlAttribute& GetAttribute(const char* name)
    {
      if (content == 0 || name == 0)
        anAttribute = XmlAttribute(); // nothing to search: report "not found"
      else
        anAttribute.find(content, name);
      return anAttribute;
    }

    // Without a name: returns the text that follows the first '>' of this
    // element, or 0 when there is no '>'.
    // With a name: returns the first "<name" located after the start of this
    // element and followed by white space, '>' or '/'; 0 when there is none.
    const char* GetChild(const char* name = 0) const
    {
      if (content == 0 || *content == '\0')
        return 0;

      if (name == 0)
      {
        const char* close = strchr(content, '>');
        return close == 0 ? 0 : close + 1;
      }

      std::string child = "<";
      child += name;
      for (const char* hit = strstr(content+1, child.c_str());
           hit != 0;
           hit = strstr(hit+1, child.c_str()))
      {
        // Reject longer names that merely start with `name`.
        const char next = hit[child.size()];
        if (next == ' ' || next == '\t' || next == '\r' || next == '\n' ||
            next == '>' || next == '/')
          return hit;
      }
      return 0;
    }

    // Same search as GetChild(); kept as a separate name for the caller's intent.
    const char* GetSibling(const char* name = 0) const { return GetChild(name); }
};

We define two classes of which the user should be aware: XmlElement and XmlAttribute.

Here is an example of parsing a simple XML message. It also illustrates generation of such messages without big XML libraries:

C++
// Common base of the sample messages: every message carries `b` and can
// append its XML representation to a string.
struct Base
{
  int b;
  // Messages are handed out as Base pointers, so destroying one through such
  // a pointer has to reach the derived destructor.
  virtual ~Base() {}
  // Appends the XML form of the message to s.
  virtual void ToXML(std::string &s) const = 0;
};

// Sample message with an int, a short text and a float field.
struct DerivedOne : public Base
{
  int d11;
  char d12[10];
  float d13;
  // Appends the message to s.
  void ToXML(std::string &s) const;
  // Appends the message to s, using `name` as the text that opens the element.
  void ToXML(std::string &s, const char* name) const;
  // Fills the fields from the attributes of xmlmsg.
  void LoadXML(XmlElement& xmlmsg);
};

// Sample message with an unsigned field, a flag and a text of up to 199 characters.
struct DerivedTwo : public Base
{
  DWORD d21;
  bool d22;
  char d23[200];
  // Appends the message to s.
  void ToXML(std::string &s) const;
};

// Sample message that nests three DerivedOne records next to a flag.
struct DerivedThree: public Base
{
  DerivedOne d31[3];
  bool d32;
  // Appends the message to s.
  void ToXML(std::string &s) const;
};

// Builds the message object described by the first XML element in msgstring.
//
// Returns a heap-allocated DerivedOne, DerivedTwo or DerivedThree, selected
// by the element name, or NULL when msgstring is null, contains no '<', or
// names an unknown element. The caller owns the returned object.
// NOTE: deleting it through the returned const Base* is only well defined
// when Base has a virtual destructor.
const Base* CreateTypedMessage(const char* msgstring)
{
  _ASSERT(msgstring != 0 && strstr(msgstring, "<") != 0);

  // _ASSERT is a debug-build check in the MSVC run-time library, so bad
  // input is rejected at run time as well.
  const char* start = (msgstring != 0) ? strstr(msgstring, "<") : 0;
  if (start == 0)
    return NULL;

  XmlElement xmlmsg(start);

  if (xmlmsg.Name == "DerivedOne")
  {
    DerivedOne* one = new DerivedOne();
    one->LoadXML(xmlmsg);
    return one;
  }

  if (xmlmsg.Name == "DerivedTwo")
  {
    DerivedTwo* two = new DerivedTwo();
    two->b = xmlmsg.GetAttribute("b");
    two->d21 = xmlmsg.GetAttribute("d21");
    two->d22 = xmlmsg.GetAttribute("d22");

    // The text runs from the end of the opening tag to the next '<'; it is
    // truncated to what d23 can hold.
    size_t length = 0;
    if (strchr(start, '>') != 0)
    {
      const char* text = xmlmsg.GetChild();
      length = strcspn(text, "<");
      if (length > sizeof(two->d23) - 1)
        length = sizeof(two->d23) - 1;
      memcpy(two->d23, text, length);
    }
    two->d23[length] = '\0';
    return two;
  }

  if (xmlmsg.Name == "DerivedThree")
  {
    DerivedThree* three = new DerivedThree();
    three->b = xmlmsg.GetAttribute("b");
    three->d32 = xmlmsg.GetAttribute("d32");

    // Walk the d31 children in document order; stop early when fewer than
    // three are present.
    const char* child = xmlmsg.GetChild("d31");
    for (int i = 0; i < 3 && child != 0; ++i)
    {
      XmlElement xmlchild(child);
      three->d31[i].LoadXML(xmlchild);
      child = xmlchild.GetSibling("d31");
    }
    return three;
  }

  return NULL; // unknown element name
}

// Fills every field from the attribute of the same name in xmlmsg.
void DerivedOne::LoadXML(XmlElement& xmlmsg)
{
    b = xmlmsg.GetAttribute("b");
    d11 = xmlmsg.GetAttribute("d11");
    d12 &= xmlmsg.GetAttribute("d12"); // note use of operator &= to fill the char array
    d13 = xmlmsg.GetAttribute("d13");
}

// Appends this message to s by delegating to the two-argument overload with
// the default opening text of the element.
void DerivedOne::ToXML(string& s) const
{
  static const char* const openingText = "<DerivedOne ";
  ToXML(s, openingText);
}

void DerivedOne::ToXML(string& s, const char* name) const
{
  char buf[20];

  s += name;
  s += "b="; sprintf(buf, "\"%d\" ", b); s += buf;
  s += "d11="; sprintf(buf, "\"%d\" ", d11); s += buf;
  s += "d12=\""; s += d12; s += "\" ";
  s += "d13="; sprintf(buf, "\"%f\" ", d13); s += buf;
  s += " />";
}

void DerivedTwo::ToXML(string& s) const
{
  char buf[20];

  s += "<DerivedTwo ";
  s += "b="; sprintf(buf, "\"%d\" ", b); s += buf;
  s += "d21="; sprintf(buf, "\"%u\" ", d21); s += buf;

  s += "d22="; s += (d22 ? "\"true\"" : "\"false\""); s += buf;
  s += " />";
  s += d23;
  s += "</DerivedTwo>";
}
 
void DerivedThree::ToXML(string& s) const
{
  char buf[20];

  s += "<DerivedThree ";
  s += "b="; sprintf(buf, "\"%d\" ", b); s += buf;
  s += "d32="; s += (d32 ? "\"true\"" : "\"false\""); s += buf;
  s += " />";
  d31[0].ToXML(s, "d31 idx=\"0\" ");
  d31[1].ToXML(s, "d31 idx=\"1\" ");
  d31[2].ToXML(s, "d31 idx=\"2\" ");
  s += "</DerivedThree>";
}

Here are the messages in XML format, for reference:

XML
<DerivedOne b="1211" d11="-12" d12="abcd" d13="31.2" />

<DerivedTwo b="1212" d21="1212" d22="True">this is the text for d23</DerivedTwo>

<DerivedThree b="1213" d32="True">
  <d31 idx="0" b="1221" d11="-12" d12="abcd" d13="31.2" />
  <d31 idx="1" b="1231" d11="-12" d12="abcd" d13="31.2" />
  <d31 idx="2" b="1241" d11="-12" d12="abcd" d13="31.2" />
 </DerivedThree>

Points of Interest

Note that from our point of view, GetSibling() and GetChild() are synonyms. I strongly suggest that in your code that uses this class, you do not forget the semantic difference. The rules of C++ make it impossible to overload the operator =() the way we need it, therefore I am using an overloaded operator &=() instead.

I strongly advise defining LoadXML(XmlElement& xmlmsg) methods (as we do for class DerivedOne) instead of accessing the data members directly, as we do for DerivedTwo.

It is most natural to have XML encoded as UTF-8. But you should also take care of encoding < as &lt; and decoding it back. Here is a snippet of such an encoder and decoder, using the Win32 API, that works with wchar_t C strings.

C++
// Appends `content` to s as UTF-8 text that is safe to place inside an XML
// element: '<', '>' and '&' become &lt;, &gt; and &amp;, other ASCII
// characters are copied, and everything else is converted with
// WideCharToMultiByte. A character that cannot be converted is written as '?'.
// A null `content` appends nothing.
void Encoder(const wchar_t* content, std::string& s)
{
  if (content == 0)
    return;

  for (const wchar_t* pc = content; *pc; pc++)
  {
    if (!iswascii(*pc))
    {
      // A UTF-16 surrogate pair encodes one character and has to be
      // converted as a unit.
      int units = 1;
      if (*pc >= 0xD800 && *pc <= 0xDBFF && pc[1] >= 0xDC00 && pc[1] <= 0xDFFF)
        units = 2;

      char buf[8];
      const int written =
        WideCharToMultiByte(CP_UTF8, 0, pc, units, buf, static_cast<int>(sizeof(buf)), NULL, NULL);
      if (written > 0)
        s.append(buf, written); // the output is not NUL-terminated; use the returned length
      else
        s += '?';

      if (units == 2)
        pc++; // skip the low surrogate that was just consumed
    }
    else if (*pc == L'<')
      s += "&lt;";
    else if (*pc == L'>')
      s += "&gt;";
    else if (*pc == L'&')
      s += "&amp;";
    else
      s += static_cast<char>(*pc);
  }
}

// Converts the UTF-8 text content of xmlmsg (everything between the end of
// its opening tag and the next '<') to a wide string and replaces the
// entities &amp;, &lt; and &gt; by the characters they stand for.
//
// Returns a buffer allocated with new[] that the caller must release with
// delete[], or 0 when there is no text or the conversion fails.
const wchar_t* Decoder(const XmlElement& xmlmsg)
{
  const char* pc = xmlmsg.GetChild();
  if (pc == 0)
    return 0;

  // strcspn() also copes with text that is not followed by another tag.
  const int bytes = static_cast<int>(strcspn(pc, "<"));
  const int size = MultiByteToWideChar(CP_UTF8, 0, pc, bytes, 0, 0);
  if (size <= 0)
    return 0;

  wchar_t* content = new wchar_t[size+1];
  if (size != MultiByteToWideChar(CP_UTF8, 0, pc, bytes, content, size))
  {
    delete[] content; // something went wrong during conversion
    return 0;
  }
  content[size] = 0;

  // Decode in place: the output never gets ahead of the input because every
  // entity is longer than the character that replaces it.
  wchar_t* pdst = content;
  for (const wchar_t* psrc = content; *psrc != 0; psrc++, pdst++)
  {
    if (wcsncmp(psrc, L"&amp;", 5) == 0)
    {
      *pdst = L'&';
      psrc += 4;
    }
    else if (wcsncmp(psrc, L"&lt;", 4) == 0)
    {
      *pdst = L'<';
      psrc += 3;
    }
    else if (wcsncmp(psrc, L"&gt;", 4) == 0)
    {
      *pdst = L'>';
      psrc += 3;
    }
    else
    {
      *pdst = *psrc;
    }
  }
  *pdst = 0; // cut off what is left over from the longer, encoded text

  return content;
}

History

  • January 3rd, 2008 - Added Encoder and Decoder snippets

License

This article, along with any associated source code and files, is licensed under The Code Project Open License (CPOL)