I have a list of predefined regex pattern strings (around `7,000` regex patterns used to group similar types of messages).
Now I have two lists: one of `regex patterns` and another of `real messages`, which contain some variable names.
I need to group all the similar messages and show those grouped messages. Currently I have to traverse 7,000 regex patterns to group the similar items among 1,000 messages, so it takes `m*n iterations` to find the correct groups.
To reduce the processing time, I remove matched items from the list of messages after each pass, e.g. `1000 - (matched items on the previous iteration)`.
What I have tried:
Processing these two lists was taking far too long. To reduce the time, I grouped the messages by message category type and processed the categories in parallel tasks.
// Partition the knowledge-base patterns by error type in a single pass
// (ToLookup) instead of re-scanning the full list four times with Where().
// A lookup returns an empty sequence for a missing key, so the resulting
// lists are never null.
ILookup<string, KBError> kbErrorsByType = distinctKBErrors.ToLookup(kbErr => kbErr.ErrorType);
List<KBError> warningKBErrors = kbErrorsByType["Warning"].ToList();
List<KBError> fatalKBErrors = kbErrorsByType["Fatal"].ToList();
List<KBError> severeKBErrors = kbErrorsByType["Severe"].ToList();
List<KBError> cbeccErrorKBErrors = kbErrorsByType["Error"].ToList();
// Drop every error whose type is not requested before doing any matching work.
errors.RemoveAll(error => !processingErrorType.HasFlag(error.ErrorType));
// Same single-pass partitioning for the real error messages.
ILookup<ErrorType, Error> errorsByType = errors.ToLookup(err => err.ErrorType);
List<Error> warningErrors = errorsByType[ErrorType.Warning].ToList();
List<Error> fatalErrors = errorsByType[ErrorType.Fatal].ToList();
List<Error> severeErrors = errorsByType[ErrorType.Severe].ToList();
List<Error> cbeccErrors = errorsByType[ErrorType.Error].ToList();
After that, these messages are processed in parallel tasks by partitioning them into equal-sized subsets of items.
// Matches one partition of error messages against the pre-filtered KB regex
// patterns and returns the grouped result for that partition.
// NOTE(review): this lambda mutates the CAPTURED outer list `errors`
// (Add/Remove inside the lock) and finally returns `errors.ToList()` — the
// whole shared list, not a result local to this call. Several Task.Run
// instances execute this lambda concurrently, and each invocation creates
// its OWN `lockObject`, so the shared List<Error> is mutated with no real
// mutual exclusion. That data race is the most likely cause of the reported
// hang, and it also means each task returns overlapping results.
Func<List<KBError>, List<Error>, List<Error>> FindDistinctErrorMessages = (filteredKBErros, filteredErros) =>
{
// NOTE(review): never read or written after construction — dead variable.
ConcurrentBag<Error> errorsList = new ConcurrentBag<Error>();
// Per-invocation lock: only serializes the merge step WITHIN this call,
// not across the parallel tasks that share `errors` (see note above).
object lockObject = new object();
// Wall-clock timing of this partition, reported via Debug.WriteLine below.
System.Diagnostics.Stopwatch sw = new System.Diagnostics.Stopwatch();
sw.Start();
// One parallel iteration per KB pattern; each worker accumulates its
// pattern->matches pairs in a thread-local dictionary (the localInit
// overload), merged once per worker in the localFinally delegate.
// NOTE(review): this Parallel.For runs INSIDE a Task.Run task (see the
// queuing code further down), so the nested parallelism oversubscribes
// the thread pool.
Parallel.For(0, filteredKBErros.Count,
() => new Dictionary<KBError, List<Error>>(),
(x, loopState, kpErrorResult) =>
{
// Pair pattern x with every message it matches.
// NOTE(review): Regex.IsMatch re-parses the pattern string for every
// (pattern, message) pair; caching a compiled Regex per pattern (or
// combining patterns) would cut a large constant factor from the m*n work.
kpErrorResult.Add(filteredKBErros[(int)x], filteredErros
.Where(error => Regex.IsMatch(error.ErrorMessage,
filteredKBErros[(int)x].ErrorMessage, System.Text.RegularExpressions.RegexOptions.IgnorePatternWhitespace)).ToList());
return kpErrorResult;
},
// localFinally: merge this worker's matches into the shared state.
(kpErrorResult) =>
{
lock (lockObject)
{
foreach (KeyValuePair<KBError, List<Error>> errorResult in kpErrorResult)
{
// Only patterns that matched at least one message produce output.
if (errorResult.Value.Count > 0)
{
Error error = null;
if (errorResult.Value.Count == 1)
{
// Single match: reuse the matched Error object as the group itself.
error = errorResult.Value.First();
}
else
{
// Multiple matches: wrap them in a new aggregate Error that
// carries the first message as the representative text.
error = new Error();
error.ErrorMessage = errorResult.Value.First().ErrorMessage;
error.Errors = errorResult.Value;
error.ErrorType = errorResult.Value.First().ErrorType;
}
error.ErrorCount = errorResult.Value.Count;
error.ErrorCode = errorResult.Key.ErrorCode;
// Attach the KB resolution text to the grouped error.
AddErrorResolutionMessage(error, errorResult.Key);
error.ErrorMessagePattern = errorResult.Key.ErrorMessage;
// NOTE(review): unsynchronized cross-task mutation of the shared
// captured list `errors` — the race described in the header.
errors.Add(error);
// Remove the raw matched messages so later patterns skip them
// (the "1000 - matched" shrink described in the question).
errorResult.Value.ForEach(err => errors.Remove(err));
}
}
}
}
);
sw.Stop();
System.Diagnostics.Debug.WriteLine(string.Format("Completed in {0} seconds", sw.Elapsed.TotalSeconds));
// Returns a snapshot of the SHARED list, not just this partition's groups.
return errors.ToList();
};
// Prune KB patterns that cannot match anything in the corresponding error
// list before the expensive regex phase.
List<KBError> filteredWarningKBList = FilterKBList(warningKBErrors, warningErrors);
List<KBError> filteredSevereKBList = FilterKBList(severeKBErrors, severeErrors);
List<KBError> filteredFatalKBList = FilterKBList(fatalKBErrors, fatalErrors);
List<KBError> filteredcbeccErrorsKBList = FilterKBList(cbeccErrorKBErrors, cbeccErrors);
List<Task<List<Error>>> tasks = new List<Task<List<Error>>>();
// Queues the matching tasks for one error category, replacing four
// copy-pasted blocks. The list is split into roughly ten equal chunks and
// one task is started per chunk. Each chunk is materialized with ToList()
// BEFORE the task starts, so the lazily produced Split() sequence is not
// enumerated later on the task thread (the original deferred
// subSet.ToList() into the Task.Run body).
Action<List<KBError>, List<Error>, ErrorType> queueCategoryTasks = (kbList, errorList, errorType) =>
{
    if (errorList.Count == 0)
        return;
    // Same gate as the original: the category must be requested explicitly
    // or via ErrorType.All.
    if (!processingErrorType.HasFlag(errorType) && processingErrorType != ErrorType.All)
        return;
    int equalCounts = errorList.Count < 10 ? 1 : errorList.Count / 10;
    foreach (IEnumerable<Error> subSet in errorList.Split(equalCounts))
    {
        List<Error> chunk = subSet.ToList();
        tasks.Add(Task.Run<List<Error>>(() => FindDistinctErrorMessages(kbList, chunk), CancellationToken.None));
    }
};
queueCategoryTasks(filteredWarningKBList, warningErrors, ErrorType.Warning);
queueCategoryTasks(filteredSevereKBList, severeErrors, ErrorType.Severe);
queueCategoryTasks(filteredFatalKBList, fatalErrors, ErrorType.Fatal);
queueCategoryTasks(filteredcbeccErrorsKBList, cbeccErrors, ErrorType.Error);
After these tasks are started, they take a long time to complete, and the wait statement for the created tasks somehow puts the application into a hung state.
// Collect the per-task partial results and merge groups that different
// tasks produced for the same pattern. (The original wrapped this in a
// try with an EMPTY finally block — pure noise, removed.)
List<Error> result = new List<Error>();
// NOTE(review): Task.WaitAll blocks this thread while each task runs its
// own nested Parallel.For; on a starved thread pool (or a UI/ASP.NET
// sync context) this is the classic recipe for the reported hang —
// prefer await Task.WhenAll(tasks).
Task.WaitAll(tasks.ToArray());
foreach (var task in tasks)
{
    result.AddRange(task.Result);
}
// NOTE(review): Distinct() uses Error's equality semantics — reference
// equality unless Error overrides Equals/GetHashCode; confirm this
// removes the intended duplicates (tasks return overlapping snapshots
// of the shared list).
result = result.Distinct().ToList();
// Entries from different tasks that matched the same pattern are merged:
// the first entry absorbs the counts and nested errors of the rest.
foreach (var grp in result.GroupBy(res => res.ErrorMessagePattern))
{
    Error error = grp.First();
    error.ErrorCount = grp.Sum(r => r.ErrorCount);
    if (grp.Count() > 1)
    {
        foreach (Error grpElement in grp)
        {
            // `!=` kept as in the original — Error may overload it.
            if (grpElement != error)
            {
                if (error.Errors == null)
                    error.Errors = new List<Error>();
                grpElement.ErrorCount = 1;
                if (grpElement.Errors != null && grpElement.Errors.Count > 0)
                {
                    error.Errors.AddRange(grpElement.Errors);
                    grpElement.Errors = null;
                }
            }
        }
    }
    distinctErrors.Add(error);
}
// Errors that never matched any KB pattern: record each as its own
// single-occurrence group, attach a resolution (if one exists), and log
// the message when no resolution was found.
foreach (Error error in errors)
{
    error.ErrorCount = 1;
    AddErrorResolutionMessage(error, null);
    distinctErrors.Add(error);
    if (error.PossibleResolution == "Not Found")
        logMessage.AppendLine(error.ErrorMessage);
}
> Is there a better way or algorithm to reduce the processing time of
> these lists and reduce the time complexity of the process, rather
> than processing m×n elements?