Introduction
Parsing XML documents in a pure C++ program may require huge class libraries or connecting, via COM, to the MSXML parser. But once in a while, client programs are upgraded to use an XML format for the short messages they send and receive. This article summarizes a minimalistic approach to XML parsing. We leave all verification to the server and the transport layer in an attempt to provide a small and handy set of pure C++ classes for working with small and simple XML messages.
The Code
Below is the single XmlParser.h file you need in your project. It uses the Microsoft secure extensions to the C run-time library that were introduced with Visual Studio 2005, if available. We use the STL string class, too.
#include <stdlib.h>
#include <string.h>
#include <string>
// Read-only view of one attribute value inside an XML start tag.
//
// The object never copies or owns text: after a successful find() it points
// into the caller's buffer, so that buffer must outlive every conversion.
// Only double-quoted values are recognised, and the attribute name must be
// preceded by a single space (" name=\"").
class XmlAttribute
{
    // First character of the value (just past the opening quote), or 0 when
    // the most recent find() did not locate the attribute.
    const char* pstr;

    // ASCII-only lower-casing; enough for comparing against "true".
    static char lower(char ch)
    {
        return (ch >= 'A' && ch <= 'Z') ? static_cast<char>(ch - 'A' + 'a') : ch;
    }

    // Returns the '>' that closes the start tag beginning at `element`,
    // skipping any '>' that sits inside a double-quoted value; 0 if the tag
    // is not terminated.
    static const char* startTagEnd(const char* element)
    {
        bool quoted = false;
        for (const char* p = element; *p != 0; ++p)
        {
            if (*p == '"')
                quoted = !quoted;
            else if (*p == '>' && !quoted)
                return p;
        }
        return 0;
    }

public:
    XmlAttribute() : pstr(0) {}

    // Locates attribute `name` in the start tag that begins at `element`.
    // Returns true and remembers the value position on success; returns
    // false (and makes every conversion yield its "empty" value) otherwise.
    bool find(const char* element, const char* name)
    {
        pstr = 0;
        if (element == 0 || name == 0)
            return false;

        std::string attr = " ";
        attr += name;
        attr += "=\"";

        const char* hit = strstr(element, attr.c_str());
        if (hit == 0)
            return false;

        // Do not report an attribute that belongs to a nested or following
        // element: the match has to lie inside this element's own start tag.
        const char* tagEnd = startTagEnd(element);
        if (tagEnd != 0 && hit > tagEnd)
            return false;

        pstr = hit + attr.size();
        return true;
    }

    // Numeric conversions parse the leading digits of the value; a missing
    // attribute converts to 0. `unsigned long` is the type Windows calls DWORD.
    operator unsigned long() const { return (pstr == 0) ? 0 : strtoul(pstr, 0, 10); }
    operator int() const { return (pstr == 0) ? 0 : atoi(pstr); }

    // True when the value starts with "true" in any letter case.
    operator bool() const
    {
        if (pstr == 0)
            return false;
        static const char expected[] = "true";
        for (size_t i = 0; i < 4; ++i)
        {
            // A mismatch also covers a value shorter than four characters.
            if (lower(pstr[i]) != expected[i])
                return false;
        }
        return true;
    }

    operator float() const { return (pstr == 0) ? 0 : static_cast<float>(atof(pstr)); }

    // Copy of the value up to the closing quote (or to the end of the buffer
    // when the quote is missing); empty string for a missing attribute.
    operator std::string() const
    {
        if (pstr == 0)
            return std::string();
        return std::string(pstr, strcspn(pstr, "\""));
    }

    // Copies the value into a fixed-size char array, truncating if necessary.
    // `dest` is always fully zeroed first, so the result is NUL-terminated
    // even when the value does not fit. Returns a pointer to dest[0].
    template<size_t size> friend const char* operator&=(char (&dest)[size], XmlAttribute attr)
    {
        memset(dest, 0, size);
        if (attr.pstr != 0)
        {
            // Leave room for the terminator: copying `size` characters would
            // overwrite the last zero written by memset.
            size_t count = strcspn(attr.pstr, "\"");
            if (count > size - 1)
                count = size - 1;
#if (_MSC_VER >= 1400)
            strncpy_s(dest, size, attr.pstr, count);
#else
            strncpy(dest, attr.pstr, count);
#endif
        }
        return &(dest[0]);
    }
};
// Lightweight cursor over one XML element inside a caller-owned,
// NUL-terminated buffer. Nothing is copied except the element name, so the
// buffer must outlive the XmlElement. No validation is performed.
class XmlElement
{
    // Start of the element text (normally its '<'); never NULL, an empty
    // string stands in for "no element".
    const char* content;
    // Scratch object reused by every GetAttribute() call.
    XmlAttribute anAttribute;

public:
    // Element name extracted from "<name ...". Empty when the text does not
    // start with '<'.
    class namestring : private std::string
    {
    public:
        namestring(const char* str)
        {
            if (str != 0 && str[0] == '<')
            {
                assign(str + 1, strcspn(str + 1, " ></\t\r\n"));
            }
        }
        operator const char*() const { return c_str(); }
        bool operator==(const char* str) const { return 0 == strcmp(c_str(), str); }
        bool operator!=(const char* str) const { return 0 != strcmp(c_str(), str); }
    };

    const namestring Name;

    // `str` may be NULL (for example the result of a failed GetChild()); the
    // element is then empty: Name is "", and every lookup fails.
    XmlElement(const char* str) : content(str != 0 ? str : ""), Name(str) {}

    // Looks `name` up in this element. The returned reference is to a single
    // internal object that the next GetAttribute() call overwrites, so
    // convert it before asking for another attribute.
    const XmlAttribute& GetAttribute(const char* name)
    {
        anAttribute.find(content, name);
        return anAttribute;
    }

    // With no name: returns the text that follows the first '>' (the element
    // body), or 0 when there is no '>'.
    // With a name: returns the next "<name" tag after the current position,
    // or 0 when there is none. The tag name must be followed by whitespace,
    // '/' or '>' so that "d3" does not match "<d31".
    const char* GetChild(const char* name = 0) const
    {
        if (name == 0)
        {
            const char* close = strchr(content, '>');
            return (close != 0) ? close + 1 : 0;
        }

        // An empty element has nothing after position 0 to search.
        if (content[0] == 0)
            return 0;

        std::string child = "<";
        child += name;

        for (const char* hit = strstr(content + 1, child.c_str());
             hit != 0;
             hit = strstr(hit + 1, child.c_str()))
        {
            const char next = hit[child.size()];
            if (next != 0 && strchr(" \t\r\n/>", next) != 0)
                return hit;
        }
        return 0;
    }

    // Same search as GetChild(); kept as a separate name so that calling
    // code can express which relationship it means.
    const char* GetSibling(const char* name = 0) const { return GetChild(name); }
};
We define two classes of which the user should be aware: XmlElement and XmlAttribute.
Here is an example of parsing a simple XML message. It also illustrates generation of such messages without big XML libraries:
// Common part of every sample message.
struct Base
{
    int b;

    // Messages are created with new and handed out as Base pointers, so
    // deleting through Base must run the derived destructor.
    virtual ~Base() {}

    // Appends the XML form of the message to `s`.
    virtual void ToXML(std::string &s) const = 0;
};
// Sample message whose fields are all carried as attributes.
struct DerivedOne : public Base
{
    int d11;
    char d12[10];
    float d13;

    // Appends the message as a "<DerivedOne ... />" element.
    void ToXML(std::string &s) const;
    // Appends the message using `name` as the opening text of the tag.
    void ToXML(std::string &s, const char* name) const;
    // Fills the fields from the attributes of `xmlmsg`.
    void LoadXML(XmlElement& xmlmsg);
};
// Sample message with attributes plus a text body (d23).
struct DerivedTwo : public Base
{
    DWORD d21;
    bool d22;
    char d23[200];

    // Appends the message as "<DerivedTwo ...>text</DerivedTwo>".
    void ToXML(std::string &s) const;
};
// Sample message that nests three DerivedOne records as child elements.
struct DerivedThree: public Base
{
    DerivedOne d31[3];
    bool d32;

    // Appends the message with its three "d31" children.
    void ToXML(std::string &s) const;
};
// Builds the message object that corresponds to the root element of
// `msgstring`.
//
// Returns a heap-allocated DerivedOne / DerivedTwo / DerivedThree that the
// caller must delete, or NULL when the text contains no element or the
// element name is not one of the three known messages.
const Base* CreateTypedMessage(const char* msgstring)
{
    _ASSERT(msgstring != 0 && strstr(msgstring, "<") != 0);

    // The assertion disappears in release builds, so check again at run time.
    const char* start = (msgstring != 0) ? strchr(msgstring, '<') : 0;
    if (start == 0)
        return NULL;

    XmlElement xmlmsg(start);

    if (xmlmsg.Name == "DerivedOne")
    {
        DerivedOne* one = new DerivedOne();
        one->LoadXML(xmlmsg);
        return one;
    }

    if (xmlmsg.Name == "DerivedTwo")
    {
        DerivedTwo* two = new DerivedTwo();
        // Explicit casts pick the conversion operator; a plain assignment
        // from XmlAttribute is ambiguous on conforming compilers.
        two->b = static_cast<int>(xmlmsg.GetAttribute("b"));
        two->d21 = static_cast<DWORD>(xmlmsg.GetAttribute("d21"));
        two->d22 = static_cast<bool>(xmlmsg.GetAttribute("d22"));

        // The body is only defined when the start tag is actually closed.
        const char* text = (strchr(start, '>') != 0) ? xmlmsg.GetChild() : 0;
        if (text != 0)
        {
            // Copy the text up to the closing tag, never past the buffer.
            size_t len = strcspn(text, "<");
            if (len > sizeof(two->d23) - 1)
                len = sizeof(two->d23) - 1;
            memcpy(two->d23, text, len);
            two->d23[len] = 0;
        }
        return two;
    }

    if (xmlmsg.Name == "DerivedThree")
    {
        DerivedThree* three = new DerivedThree();
        three->b = static_cast<int>(xmlmsg.GetAttribute("b"));
        three->d32 = static_cast<bool>(xmlmsg.GetAttribute("d32"));

        // Walk the d31 children in document order; stop early if fewer than
        // three are present (the remaining records stay zero-initialised).
        const char* cursor = xmlmsg.GetChild("d31");
        for (int i = 0; i < 3 && cursor != 0; ++i)
        {
            XmlElement xmlchild(cursor);
            three->d31[i].LoadXML(xmlchild);
            cursor = xmlchild.GetSibling("d31");
        }
        return three;
    }

    return NULL;
}
// Fills this record from the attributes b, d11, d12 and d13 of `xmlmsg`.
// A missing attribute leaves 0 (or an empty string for d12) in the field.
void DerivedOne::LoadXML(XmlElement& xmlmsg)
{
    // Each attribute is converted immediately: GetAttribute() reuses one
    // internal object. The casts select the conversion operator explicitly,
    // because plain assignment from XmlAttribute is ambiguous on conforming
    // compilers.
    b = static_cast<int>(xmlmsg.GetAttribute("b"));
    d11 = static_cast<int>(xmlmsg.GetAttribute("d11"));
    d12 &= xmlmsg.GetAttribute("d12");
    d13 = static_cast<float>(xmlmsg.GetAttribute("d13"));
}
void DerivedOne::ToXML(string& s) const
{
return ToXML(s, "<DerivedOne ");
}
void DerivedOne::ToXML(string& s, const char* name) const
{
char buf[20];
s += name;
s += "b="; sprintf(buf, "\"%d\" ", b); s += buf;
s += "d11="; sprintf(buf, "\"%d\" ", d11); s += buf;
s += "d12=\""; s += d12; s += "\" ";
s += "d13="; sprintf(buf, "\"%f\" ", d13); s += buf;
s += " />";
}
void DerivedTwo::ToXML(string& s) const
{
char buf[20];
s += "<DerivedTwo ";
s += "b="; sprintf(buf, "\"%d\" ", b); s += buf;
s += "d21="; sprintf(buf, "\"%u\" ", d21); s += buf;
s += "d22="; s += (d22 ? "\"true\"" : "\"false\""); s += buf;
s += " />";
s += d23;
s += "</DerivedTwo>";
}
void DerivedThree::ToXML(string& s) const
{
char buf[20];
s += "<DerivedThree ";
s += "b="; sprintf(buf, "\"%d\" ", b); s += buf;
s += "d32="; s += (d32 ? "\"true\"" : "\"false\""); s += buf;
s += " />";
d31[0].ToXML(s, "d31 idx=\"0\" ");
d31[1].ToXML(s, "d31 idx=\"1\" ");
d31[2].ToXML(s, "d31 idx=\"2\" ");
s += "</DerivedThree>";
}
Here are the messages in XML format, for reference:
<DerivedOne b="1211" d11="-12" d12="abcd" d13="31.2" />
<DerivedTwo b="1212" d21="1212" d22="True">this is the text for d23</DerivedTwo>
<DerivedThree b="1213" d32="True">
<d31 idx="0" b="1221" d11="-12" d12="abcd" d13="31.2" />
<d31 idx="1" b="1231" d11="-12" d12="abcd" d13="31.2" />
<d31 idx="2" b="1241" d11="-12" d12="abcd" d13="31.2" />
</DerivedThree>
Points of Interest
Note that from our point of view, GetSibling() and GetChild() are synonyms. I strongly suggest that, in code that uses this class, you do not forget the semantic difference. The rules of C++ make it impossible to overload operator=() the way we need it (it must be a member of the destination type, and a built-in char array is not a class), therefore I am using an overloaded operator&=() instead.
I strongly advise defining LoadXML(XmlElement& xmlmsg) methods (as we do for class DerivedOne) instead of accessing the data members directly, as we do for DerivedTwo.
It is most natural to have XML encoded as UTF-8. But you should also take care of encoding < as &lt; (and likewise > as &gt; and & as &amp;) and decoding it back. Here is a snippet of such an encoder and decoder, using the Win32 API, that works with wchar_t C strings.
// Appends `content` to `s` as UTF-8 text that is safe to place inside an XML
// element: '<', '>' and '&' become &lt;, &gt; and &amp;, other ASCII
// characters are copied unchanged, and everything else is converted to UTF-8
// with WideCharToMultiByte. A character that cannot be converted is written
// as '?'. A NULL `content` appends nothing.
void Encoder(const wchar_t* content, std::string& s)
{
    if (content == 0)
        return;

    for (const wchar_t* pc = content; *pc; pc++)
    {
        if (*pc == L'<')
            s += "&lt;";
        else if (*pc == L'>')
            s += "&gt;";
        else if (*pc == L'&')
            s += "&amp;";
        else if (static_cast<unsigned>(*pc) < 0x80)
            s += static_cast<char>(*pc);
        else
        {
            // A character outside the BMP is stored as a surrogate pair; both
            // halves have to be converted together or the call fails.
            int units = 1;
            if (*pc >= 0xD800 && *pc <= 0xDBFF && pc[1] >= 0xDC00 && pc[1] <= 0xDFFF)
                units = 2;

            // One code point needs at most four UTF-8 bytes.
            char buf[8];
            const int bytes = WideCharToMultiByte(CP_UTF8, 0, pc, units, buf, sizeof(buf), NULL, NULL);
            if (bytes > 0)
                s.append(buf, bytes);
            else
                s += '?';

            // Skip the low surrogate that was consumed with the high one.
            pc += units - 1;
        }
    }
}
// Converts the text body of `xmlmsg` (everything between its start tag and
// the next '<') from UTF-8 to a wide string and replaces the entities
// &amp;, &lt; and &gt; with the characters they stand for.
//
// Returns a buffer allocated with new[] that the caller must release with
// delete[], or 0 when the body is missing, empty or cannot be converted.
const wchar_t* Decoder(const XmlElement& xmlmsg)
{
    const char* pc = xmlmsg.GetChild();
    if (pc == 0)
        return 0;

    // Length of the body; runs to the end of the buffer if no tag follows.
    const int bytes = static_cast<int>(strcspn(pc, "<"));
    const int size = MultiByteToWideChar(CP_UTF8, 0, pc, bytes, 0, 0);
    if (size <= 0)
        return 0;

    wchar_t* content = new wchar_t[size + 1];
    if (size != MultiByteToWideChar(CP_UTF8, 0, pc, bytes, content, size))
    {
        // Do not hand out (or leak) a buffer with unspecified contents.
        delete[] content;
        return 0;
    }
    content[size] = 0;

    // Decode in place: the output is never longer than the input, so the
    // write position can trail the read position in the same buffer.
    wchar_t* pdst = content;
    for (const wchar_t* psrc = content; *psrc != 0; psrc++, pdst++)
    {
        if (wcsncmp(psrc, L"&amp;", 5) == 0)
        {
            *pdst = L'&';
            psrc += 4;
        }
        else if (wcsncmp(psrc, L"&lt;", 4) == 0)
        {
            *pdst = L'<';
            psrc += 3;
        }
        else if (wcsncmp(psrc, L"&gt;", 4) == 0)
        {
            *pdst = L'>';
            psrc += 3;
        }
        else
        {
            *pdst = *psrc;
        }
    }
    // Cut off the tail left behind where entities shrank to one character.
    *pdst = 0;

    return content;
}
History
- January 3rd, 2008 - Added Encoder and Decoder snippets