Introduction
One of the most useful features of std::regex is the ability to capture text. However, regex capture groups fall short for more ambitious captures, such as capturing every item in a repeated list.
Example
#include <iostream>
#include <regex>
// Searches the input for comma-separated lists of names using std::regex and
// prints every non-empty sub-match of each match (index 0 is the whole match),
// one per line, followed by a blank line after each match.
int main()
{
    try
    {
        const std::regex rx("([A-Z_a-z]\\w*)(?:\\s*,\\s*([A-Z_a-z]\\w*))*");
        const std::string input = "111 One, 2, Three, Four 222 Five,Six,Seven Eight, 9, 10";
        const char *first = input.c_str();
        const char *last = first + input.size();
        std::cregex_iterator iter(first, last, rx);
        std::cregex_iterator end;

        while (iter != end)
        {
            const std::cmatch &match = *iter;

            for (std::size_t idx = 0; idx < match.size(); ++idx)
            {
                const std::string text = match[idx].str();

                // Sub-matches that did not participate yield an empty string.
                if (text.empty()) continue;

                std::cout << text << '\n';
            }

            std::cout << '\n';
            ++iter;
        }
    }
    catch (const std::exception &e)
    {
        std::cout << e.what() << '\n';
    }

    return 0;
}
This code outputs the following:
One
One

Three, Four
Three
Four

Five,Six,Seven
Five
Seven

Eight
Eight
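The problem is with the case "Five,Six,Seven": the regex captures only "Five" and "Seven", omitting "Six". This happens because a capture group inside a repetition retains only the text from its last iteration, so each later list item overwrites the previous one.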
The Solution
#include "parsertl/enums.hpp"
#include "parsertl/generator.hpp"
#include <iostream>
#include "parsertl/search_iterator.hpp"
// Builds a parsertl grammar (with captures enabled) for a comma-separated
// list of names and a lexertl lexer for its tokens, then searches the input
// and prints every captured range of every match, one per line, followed by
// a blank line after each match.
int main()
{
    try
    {
        // Grammar: a list is one Name, or a list followed by ',' and a Name.
        parsertl::rules grules(*parsertl::rule_flags::enable_captures);

        grules.token("Name");
        grules.push("list", "(Name) "
            "| list ',' (Name)");

        parsertl::state_machine gsm;

        parsertl::generator::build(grules, gsm);

        // Lexer: token ids are looked up from the grammar, so the grammar
        // rules above must already be defined at this point.
        lexertl::rules lrules;

        lrules.push("[A-Z_a-z]\\w*", grules.token_id("Name"));
        lrules.push(",", grules.token_id("','"));
        lrules.push("\\s+", lexertl::rules::skip());

        lexertl::state_machine lsm;

        lexertl::generator::build(lrules, lsm);

        std::string input = "111 One, 2, Three, Four 222 Five,Six,Seven Eight, 9, 10";
        const char *first = input.c_str();
        const char *last = first + input.size();
        lexertl::citerator liter(first, last, lsm);
        parsertl::csearch_iterator iter(liter, gsm);
        parsertl::csearch_iterator end;

        while (iter != end)
        {
            // Each match holds a sequence of captures; each capture is a
            // sequence of [first, second) ranges.
            for (const auto &capture : *iter)
            {
                for (const auto &range : capture)
                {
                    std::cout << std::string(range.first, range.second) << '\n';
                }
            }

            std::cout << '\n';
            ++iter;
        }
    }
    catch (const std::exception &e)
    {
        std::cout << e.what() << '\n';
    }

    return 0;
}
The above code outputs the following:
One
One

Three, Four
Three
Four

Five,Six,Seven
Five
Six
Seven

Eight
Eight
The first line in each block is the equivalent of $0 in regex parlance. The following lines are $1 and $2. Each capture is a vector in order to support recursion, so in the example of "Five,Six,Seven", Six and Seven are stored under $2 and Five is stored under $1.
History
22/03/2018 Created.
23/03/2018 Added regex example.
15/02/2024 Updated example to use parsertl17 syntax.