// This is similarity-matching code: it computes the similarity between the titles of authors' papers and the titles of clusters (queries).
// The code currently runs very slowly, so please help me optimize it. Only the Main() function needs to be changed. Please help.
using System;
using System.IO;
using System.Collections.Generic;
using System.Text;
using System.Text.RegularExpressions;
using System.Collections;
using System.Linq;
namespace VectorSpaceModel
{
class Program
{
// Term vectors for the query being processed: key "Query" holds the query vector,
// keys "1", "2", ... hold the vectors of the loaded titles (filled by createVector).
static Hashtable DTVector = new Hashtable();
// Distinct words of the query and all titles, in first-seen order (filled by createWordList).
static List<string> wordlist = new List<string>();
// NOTE(review): the type arguments were missing in the source and this field is never
// used in this file, so <double, string> is a guess — confirm the intended type or delete it.
static Dictionary<double, string> sortedList1 = new Dictionary<double, string>();
// Document id -> cosine similarity with the current query; classify() adds (string, double) pairs.
static Dictionary<string, double> sortedList = new Dictionary<string, double>();
// Slot 0 is overwritten with the current query by Main; slots 1.. hold the titles.
static string[] docs = new string[37406];
/// <summary>
/// Loads the titles file (one "author=title" record per line), then for every line of the
/// queries file builds the vocabulary and the term vectors, scores every title against the
/// query and appends the non-zero matches to the output file (and echoes them to the console).
/// </summary>
/// <param name="args">Command-line arguments; not used.</param>
static void Main(string[] args)
{
    string fileName2 = @"F:\fyp\fnlfyp\vsm2\OSIM\vsm2\output_titles.txt";
    string fileName3 = @"F:\fyp\fnlfyp\vsm2\OSIM\vsm2\queries.txt";
    string fileName4 = @"F:\fyp\fnlfyp\vsm2\OSIM\vsm2\output_vsm.txt";

    // Collect the titles in a growable list so the corpus is sized by the file itself:
    // a longer file no longer overruns a fixed array and a shorter one leaves no null slots.
    List<string> titles = new List<string>();
    titles.Add(string.Empty); // slot 0 is reserved for the current query
    using (StreamReader titleReader = new StreamReader(fileName2))
    {
        string record;
        while ((record = titleReader.ReadLine()) != null)
        {
            string[] parts = record.Split('=');
            Console.WriteLine(titles.Count);
            // Only the title part is needed for matching. A record without '=' keeps its
            // position (document ids are line numbers) but contributes an empty title.
            titles.Add(parts.Length > 1 ? parts[1] : string.Empty);
        }
    }
    docs = titles.ToArray();

    int num = 0;
    // The output file is opened once, in append mode, instead of once per query.
    using (StreamReader queryReader = new StreamReader(fileName3))
    using (StreamWriter writer = new StreamWriter(fileName4, true))
    {
        string query;
        while ((query = queryReader.ReadLine()) != null)
        {
            docs[0] = query;
            num++;

            createWordList();
            createVector();
            classify();

            writer.WriteLine(num);
            writer.WriteLine(query);
            foreach (var x in sortedList.Reverse())
            {
                Console.WriteLine("Doc{1} -> {0}", x.Value, x.Key);
                writer.WriteLine("Doc{1} -> {0}", x.Value, x.Key);
            }
            writer.WriteLine("");
            // Flush per query so finished results reach the disk even if a later query fails.
            writer.Flush();

            // Reset the per-query state before the next query is loaded into slot 0.
            DTVector.Clear();
            wordlist.Clear();
            sortedList.Clear();
        }
    }
}
/// <summary>
/// Extends <c>wordlist</c> with every distinct word (regex <c>\w+</c>) found in
/// <c>docs</c>, keeping first-seen order. Words already in the list are not added again.
/// </summary>
public static void createWordList()
{
    // Same pattern and options as getWordList, but compiled once for the whole corpus.
    Regex wordPattern = new Regex("\\w+", RegexOptions.IgnoreCase);

    // Hash-based membership test: the list alone would need a linear scan per word,
    // which makes the vocabulary build quadratic in the number of distinct words.
    HashSet<string> known = new HashSet<string>(wordlist);

    foreach (string doc in docs)
    {
        if (doc == null)
        {
            continue; // unfilled slot in the fixed-size docs array
        }

        foreach (Match match in wordPattern.Matches(doc))
        {
            if (known.Add(match.Value))
            {
                wordlist.Add(match.Value);
            }
        }
    }
}
/// <summary>
/// Appends to <paramref name="wordlist"/> every word (regex <c>\w+</c>) of
/// <paramref name="query"/> that the list does not contain yet.
/// </summary>
/// <param name="wordlist">List to extend; it is modified in place.</param>
/// <param name="query">Text to extract words from.</param>
/// <returns>The same list instance that was passed in.</returns>
public static List<string> getWordList(List<string> wordlist, string query)
{
    foreach (Match hit in Regex.Matches(query, "\\w+", RegexOptions.IgnoreCase))
    {
        string word = hit.Value;
        if (wordlist.Contains(word))
        {
            continue;
        }
        wordlist.Add(word);
    }
    return wordlist;
}
/// <summary>
/// Builds one TF-IDF vector per entry of <c>docs</c> (one component per entry of
/// <c>wordlist</c>) and stores it in <c>DTVector</c>: entry 0 under the key "Query",
/// entry j under the key j.ToString().
/// </summary>
public static void createVector()
{
    int termCount = wordlist.Count;

    // getIDF scans the whole corpus and its result depends only on the term, so it is
    // evaluated once per term here rather than once per (document, term) pair.
    double[] idf = new double[termCount];
    for (int i = 0; i < termCount; i++)
    {
        idf[i] = getIDF(wordlist[i]);
    }

    for (int j = 0; j < docs.Length; j++)
    {
        // Tokenise the document once; every term frequency is then a dictionary lookup.
        Dictionary<string, int> counts = countTokens(docs[j]);

        double[] vector = new double[termCount];
        for (int i = 0; i < termCount; i++)
        {
            int tf;
            if (counts.TryGetValue(wordlist[i], out tf))
            {
                vector[i] = tf * idf[i];
            }
            // Terms absent from the document keep the default component 0.0.
        }

        DTVector.Add(j == 0 ? "Query" : j.ToString(), vector);
    }
}

/// <summary>
/// Counts the occurrences of each whitespace-separated token of a document, using the
/// same split (regex <c>\s</c>) as getTF. A null document yields an empty table.
/// </summary>
private static Dictionary<string, int> countTokens(string document)
{
    Dictionary<string, int> counts = new Dictionary<string, int>();
    if (document == null)
    {
        return counts;
    }

    foreach (string token in Regex.Split(document, "\\s"))
    {
        int seen;
        counts.TryGetValue(token, out seen); // leaves seen at 0 for a new token
        counts[token] = seen + 1;
    }
    return counts;
}
/// <summary>
/// Scores every vector in <c>DTVector</c> other than the "Query" entry against the query
/// vector with <c>cosinetheta</c> and records each non-zero score in <c>sortedList</c>
/// under the entry's key.
/// </summary>
public static void classify()
{
    // Work on a private copy of the query vector.
    double[] query = new double[wordlist.Count];
    Array.Copy((double[])DTVector["Query"], query, wordlist.Count);

    foreach (DictionaryEntry entry in DTVector)
    {
        string id = entry.Key.ToString();
        if (id == "Query")
        {
            continue;
        }

        double score = cosinetheta(query, (double[])entry.Value);
        if (score != 0)
        {
            sortedList.Add(id, score);
        }
    }
}
/// <summary>
/// Returns the dot product of two vectors, or 0.0 when their lengths differ.
/// </summary>
/// <param name="v1">First vector.</param>
/// <param name="v2">Second vector.</param>
/// <returns>Sum of the component-wise products; 0.0 on a length mismatch.</returns>
public static double dotproduct(double[] v1, double[] v2)
{
    if (v1.Length != v2.Length)
    {
        return 0.0;
    }

    double sum = 0.0;
    int index = 0;
    while (index < v1.Length)
    {
        sum += v1[index] * v2[index];
        index++;
    }
    return sum;
}
/// <summary>
/// Returns the Euclidean length of a vector: the square root of the sum of its
/// squared components.
/// </summary>
/// <param name="vector">Vector to measure.</param>
/// <returns>The length; 0 for an empty vector.</returns>
public static double vectorlength(double[] vector)
{
    double sumOfSquares = 0.0;
    foreach (double component in vector)
    {
        sumOfSquares += Math.Pow(component, 2);
    }
    return Math.Sqrt(sumOfSquares);
}
/// <summary>
/// Term frequency: the number of tokens of <paramref name="document"/>, split on single
/// whitespace characters (regex <c>\s</c>), that are exactly equal to <paramref name="term"/>.
/// </summary>
/// <param name="document">Text to search.</param>
/// <param name="term">Token to count.</param>
/// <returns>The number of matching tokens, as a double.</returns>
private static double getTF(string document, string term)
{
    string[] tokens = Regex.Split(document, "\\s");
    return tokens.Count(token => token == term);
}
/// <summary>
/// Inverse document frequency of <paramref name="term"/> over the titles in
/// <c>docs[1..]</c> (slot 0 is skipped): log(D / df), where D is the number of loaded
/// titles and df the number of titles that contain the term as a substring.
/// </summary>
/// <param name="term">Term to look up.</param>
/// <returns>log(D / df), or 0.0 when no title contains the term.</returns>
private static double getIDF(string term)
{
    int corpusSize = 0;
    int documentFrequency = 0;

    for (int i = 1; i < docs.Length; i++)
    {
        string doc = docs[i];
        if (doc == null)
        {
            // docs is a fixed-size array; a slot that was never filled is not a document.
            continue;
        }

        corpusSize++;
        if (doc.Contains(term))
        {
            documentFrequency++;
        }
    }

    if (documentFrequency == 0)
    {
        return 0.0;
    }
    return Math.Log((double)corpusSize / documentFrequency);
}
/// <summary>
/// Cosine of the angle between two vectors: dotproduct(v1, v2) divided by the product
/// of their lengths.
/// </summary>
/// <param name="v1">First vector.</param>
/// <param name="v2">Second vector.</param>
/// <returns>The cosine similarity, or 0 when either vector has zero length.</returns>
public static double cosinetheta(double[] v1, double[] v2)
{
    double lengthV1 = vectorlength(v1);
    double lengthV2 = vectorlength(v2);

    // Both lengths must be checked: with a zero-length v2 the division is 0 / 0 = NaN,
    // and NaN passes the "!= 0" filter in classify(), so it would be reported as a match.
    if (lengthV1 == 0 || lengthV2 == 0)
    {
        return 0;
    }

    return dotproduct(v1, v2) / (lengthV1 * lengthV2);
}
}
}