Click here to Skip to main content
65,938 articles
CodeProject is changing. Read more.
Articles / web / HTML

General string parsing techniques

4.00/5 (1 vote)
30 Oct 2011CPOL 35.3K  
General string parsing techniques
General-purpose string parsing helpers. They are handy because it is natural to think in terms of "I want the text between this token and that token".

I use these helpers to parse HTML quickly. Here is an example that collects every `<img id ... />` element from a page. My article[^] contains an XML parser built on the same technique.

C#
// Collects every "<img id ... />" element of the page into removeList.
// Convert the page to a string once; nothing in the loop modifies html,
// so repeating the conversion on every iteration is wasted work.
string page = html.ToString();
int outLastMatch = 0;
while (true)
{
    // Each call resumes the search at the index where the previous end token was found.
    string result = StringUtilities.GetBetween(page, "<img id", "/>", outLastMatch, out outLastMatch);
    if (result == String.Empty)
    {
        // GetBetween returns String.Empty once no further match exists.
        break;
    }

    removeList.Add(result);
}


C#
public class StringUtilities
{
    /// <summary>
    /// Returns the first substring of <paramref name="source"/>, searching from
    /// <paramref name="startMatch"/>, that begins with <paramref name="start"/> and ends with
    /// <paramref name="end"/>. Both tokens are included in the result.
    /// To enumerate all matches, call this in a loop and pass the previous
    /// <paramref name="lastMatch"/> back in as <paramref name="startMatch"/>.
    /// Tokens are matched with ordinal (culture-independent) comparison.
    /// </summary>
    /// <param name="source">Text to search.</param>
    /// <param name="start">Token that opens the match.</param>
    /// <param name="end">Token that closes the match.</param>
    /// <param name="startMatch">Index at which the search begins. An index outside
    /// <paramref name="source"/> (for example -1 from a failed call) is treated as "not found".</param>
    /// <param name="lastMatch">Receives the index of the end token, or -1 when there is no match.</param>
    /// <returns>The matched text including both tokens, or String.Empty when there is no match.</returns>
    /// <exception cref="ArgumentNullException">A string argument is null.</exception>
    public static string GetBetween(string source, string start, string end, int startMatch, out int lastMatch)
    {
        RequireNotNull(source, "source");
        RequireNotNull(start, "start");
        RequireNotNull(end, "end");

        lastMatch = -1;
        int startIndex = IndexOfOrdinal(source, start, startMatch);
        if (startIndex == -1)
        {
            return String.Empty;
        }

        // Look for the end token immediately after the start token so that an empty
        // body such as "<a></a>" is matched. Only when the start token is empty do we
        // skip one character, which guarantees lastMatch moves past startMatch and a
        // calling loop always makes progress.
        int searchFrom = startIndex + Math.Max(start.Length, 1);
        int endIndex = IndexOfOrdinal(source, end, searchFrom);
        if (endIndex == -1)
        {
            return String.Empty;
        }

        lastMatch = endIndex;
        return source.Substring(startIndex, endIndex - startIndex + end.Length);
    }

    /// <summary>
    /// Index-only variant of GetBetween: locates the text between two tokens without
    /// allocating a substring. Tokens are matched with ordinal comparison.
    /// </summary>
    /// <param name="source">Text to search.</param>
    /// <param name="start">Token that opens the match.</param>
    /// <param name="searchEnd">Token that closes the match.</param>
    /// <param name="begin">On entry, the index at which the search begins. On exit, the index of
    /// the first character after the start token, or -1 when the start token was not found.</param>
    /// <param name="end">Receives the index of the end token, or -1 when the start token or the
    /// end token was not found. The text between the tokens is source.Substring(begin, end - begin).</param>
    /// <exception cref="ArgumentNullException">A string argument is null.</exception>
    public static void GetBetweenExcludeTokens(string source, string start, string searchEnd, ref int begin, out int end)
    {
        RequireNotNull(source, "source");
        RequireNotNull(start, "start");
        RequireNotNull(searchEnd, "searchEnd");

        end = -1;

        // Test the raw IndexOf result before adding the token length; once the
        // length has been added, a failed search no longer compares equal to -1.
        int tokenIndex = IndexOfOrdinal(source, start, begin);
        if (tokenIndex == -1)
        {
            begin = -1;
            return;
        }

        begin = tokenIndex + start.Length;
        end = IndexOfOrdinal(source, searchEnd, begin);
    }

    /// <summary>
    /// Like GetBetween, but the returned text excludes the start and end tokens.
    /// Tokens are matched with ordinal comparison.
    /// </summary>
    /// <param name="source">Text to search.</param>
    /// <param name="start">Token that opens the match.</param>
    /// <param name="end">Token that closes the match.</param>
    /// <param name="startMatch">Index at which the search begins. An index outside
    /// <paramref name="source"/> is treated as "not found".</param>
    /// <param name="lastMatch">Receives the index of the end token, or -1 when there is no match.</param>
    /// <returns>The text between the tokens. String.Empty when there is no match; note that a
    /// match with nothing between the tokens also yields String.Empty, so check
    /// <paramref name="lastMatch"/> to tell the two apart.</returns>
    /// <exception cref="ArgumentNullException">A string argument is null.</exception>
    public static string GetBetweenExcludeTokens(string source, string start, string end, int startMatch, out int lastMatch)
    {
        RequireNotNull(source, "source");
        RequireNotNull(start, "start");
        RequireNotNull(end, "end");

        lastMatch = -1;
        int startIndex = IndexOfOrdinal(source, start, startMatch);
        if (startIndex == -1)
        {
            return String.Empty;
        }

        int contentStart = startIndex + start.Length;
        lastMatch = IndexOfOrdinal(source, end, contentStart);
        if (lastMatch == -1)
        {
            return String.Empty;
        }

        return source.Substring(contentStart, lastMatch - contentStart);
    }

    // Ordinal IndexOf that reports an out-of-range starting index as "not found" (-1) instead of throwing.
    private static int IndexOfOrdinal(string source, string token, int from)
    {
        if (from < 0 || from > source.Length)
        {
            return -1;
        }

        return source.IndexOf(token, from, StringComparison.Ordinal);
    }

    // Throws ArgumentNullException naming the offending parameter when value is null.
    private static void RequireNotNull(string value, string paramName)
    {
        if (value == null)
        {
            throw new ArgumentNullException(paramName);
        }
    }
}

License

This article, along with any associated source code and files, is licensed under The Code Project Open License (CPOL)