|
Hello Luc,
thanks for replying
Now I have a little problem. I never learned VB.net, (somehow I even wonder how I was able to write a program like that, lol) I browsed the www and looked for the infos I need, experimented with code snippets and everything works like a charm, except the current problem where I want to make it a bit faster using 2 threads.
I updated the start post before I saw your reply.
My computer's a quadcore/iQ6600.
While the 2 threads are working (even if it's the single-thread methode) my hdd-LED doesn't flash much, I mean it's only flickering now and then, so I guess it doesn't put much load on the hdd.
hm I hope it's ok to paste the code here:
Private hashdups_m As Thread
Private hashdups_t1 As Thread
Private hashdups_t2 As Thread
Private Sub hash_folder()
Dim di As New IO.DirectoryInfo(src_cmp_fldr)
Dim aryFi As IO.FileInfo() = di.GetFiles("*.*")
If di.GetFiles("*.*").Length.ToString <> 0 Then
di_hashdups = New IO.DirectoryInfo(src_cmp_fldr)
aryFi_hashdups = di_hashdups.GetFiles("*.*")
hashdups_m = New Thread(AddressOf hashdups_main)
hashdups_m.Start()
End If
End Sub
Private Sub hashdups_main()
progb2val_a = 0
progb2val_b = 0
hashdups_t1 = New Thread(AddressOf hashdups_hash1)
hashdups_t2 = New Thread(AddressOf hashdups_hash2)
If aryFi_hashdups.Count = 1 Then
progb2val_b = 50
hashdups_t1.Start()
hashdups_t1.Join()
End If
If aryFi_hashdups.Count > 1 Then
hashdups_t1.Start()
hashdups_t2.Start()
hashdups_t1.Join()
hashdups_t2.Join()
End If
If hashdups_t1.IsAlive = False And hashdups_t2.IsAlive = False Then
ds_db_cmp_fldr.Tables.Add("db_cmp_fldr")
dt_db_cmp_fldr = ds_db_cmp_fldr.Tables("db_cmp_fldr")
Dim col_fs_dec As DataColumn = New DataColumn("filesize")
col_fs_dec.DataType = System.Type.GetType("System.Decimal")
dt_db_cmp_fldr.Columns.Add(col_fs_dec)
Dim col_sha1_string As DataColumn = New DataColumn("sha256hash")
col_sha1_string.DataType = System.Type.GetType("System.String")
dt_db_cmp_fldr.Columns.Add(col_sha1_string)
Dim col_fn_string As DataColumn = New DataColumn("filename")
col_fn_string.DataType = System.Type.GetType("System.String")
dt_db_cmp_fldr.Columns.Add(col_fn_string)
If ds_db_cmp_fldr.Tables.Contains("db_cmp_fldr_a") = True Then
For Each drSource_a As DataRow In dt_db_cmp_fldr_a.Rows
dt_db_cmp_fldr.ImportRow(drSource_a)
Next
ds_db_cmp_fldr.Tables.Remove("db_cmp_fldr_a")
End If
If ds_db_cmp_fldr.Tables.Contains("db_cmp_fldr_b") = True Then
For Each drSource_b As DataRow In dt_db_cmp_fldr_b.Rows
dt_db_cmp_fldr.ImportRow(drSource_b)
Next
ds_db_cmp_fldr.Tables.Remove("db_cmp_fldr_b")
End If
Me.Invoke(New upd_hashdups_m(AddressOf upd_hashdups_m_val), 1)
End If
End Sub
Private Sub hashdups_hash1()
ds_db_cmp_fldr.Tables.Add("db_cmp_fldr_a")
dt_db_cmp_fldr_a = ds_db_cmp_fldr.Tables("db_cmp_fldr_a")
Dim col_fs_dec_a As DataColumn = New DataColumn("filesize")
col_fs_dec_a.DataType = System.Type.GetType("System.Decimal")
dt_db_cmp_fldr_a.Columns.Add(col_fs_dec_a)
Dim col_sha1_string_a As DataColumn = New DataColumn("sha256hash")
col_sha1_string_a.DataType = System.Type.GetType("System.String")
dt_db_cmp_fldr_a.Columns.Add(col_sha1_string_a)
Dim col_fn_string_a As DataColumn = New DataColumn("filename")
col_fn_string_a.DataType = System.Type.GetType("System.String")
dt_db_cmp_fldr_a.Columns.Add(col_fn_string_a)
Dim myNewRow As DataRow
Dim aryficnt_a As Integer = Math.Ceiling(aryFi_hashdups.Count / 2)
For r1 = 0 To aryficnt_a - 1
result = Nothing
hashclass.HashFile(aryFi_hashdups(r1).FullName)
myNewRow = dt_db_cmp_fldr_a.NewRow()
myNewRow("filesize") = aryFi_hashdups(r1).Length
myNewRow("sha256hash") = result
myNewRow("filename") = aryFi_hashdups(r1).Name
dt_db_cmp_fldr_a.Rows.Add(myNewRow)
progb2val_a = (50 * (r1 + 1) / aryficnt_a)
Me.Invoke(New upd_progbar2_hash(AddressOf progbar2_val_hash), Convert.ToString(0))
Next
Me.Invoke(New upd_progbar2_hash(AddressOf progbar2_val_hash), Convert.ToString(0))
End Sub
Private Sub hashdups_hash2()
ds_db_cmp_fldr.Tables.Add("db_cmp_fldr_b")
dt_db_cmp_fldr_b = ds_db_cmp_fldr.Tables("db_cmp_fldr_b")
Dim col_fs_dec_b As DataColumn = New DataColumn("filesize")
col_fs_dec_b.DataType = System.Type.GetType("System.Decimal")
dt_db_cmp_fldr_b.Columns.Add(col_fs_dec_b)
Dim col_sha1_string_b As DataColumn = New DataColumn("sha256hash")
col_sha1_string_b.DataType = System.Type.GetType("System.String")
dt_db_cmp_fldr_b.Columns.Add(col_sha1_string_b)
Dim col_fn_string_b As DataColumn = New DataColumn("filename")
col_fn_string_b.DataType = System.Type.GetType("System.String")
dt_db_cmp_fldr_b.Columns.Add(col_fn_string_b)
Dim myNewRow As DataRow
Dim aryficnt_b As Integer = Math.Ceiling(aryFi_hashdups.Count / 2)
For r2 = aryficnt_b To aryFi_hashdups.Count - 1
result = Nothing
hashclass.HashFile(aryFi_hashdups(r2).FullName)
myNewRow = dt_db_cmp_fldr_b.NewRow()
myNewRow("filesize") = aryFi_hashdups(r2).Length
myNewRow("sha256hash") = result
myNewRow("filename") = aryFi_hashdups(r2).Name
dt_db_cmp_fldr_b.Rows.Add(myNewRow)
progb2val_b = (50 * (r2 - aryficnt_b + 1) / aryficnt_b)
Me.Invoke(New upd_progbar2_hash(AddressOf progbar2_val_hash), Convert.ToString(0))
Next
Me.Invoke(New upd_progbar2_hash(AddressOf progbar2_val_hash), Convert.ToString(0))
End Sub
the hashclass:
Public Class hashclass
Public Shared Function HashFile(ByVal File As String) As String
Dim FN As New FileStream(File, FileMode.Open, FileAccess.Read, FileShare.Read, 8192)
Dim HashValue(0) As Byte
Dim Tmp As String = ""
Dim SHA256 As New SHA256Managed
SHA256.ComputeHash(FN)
HashValue = SHA256.Hash
FN.Close()
For i As Integer = 0 To HashValue.Length - 1
Tmp = Hex(HashValue(i))
If Len(Tmp) = 1 Then Tmp = "0" & Tmp
result += Tmp
Next
Return result
End Function
End Class
Maybe there's an easier way than I found, but it works
The single thread version nearly looks the same, it skips making 2 tables and later reading both into a new table. Also the FOR NEXT loop is just from 0 to aryFi_hashdups.Count - 1
I just don't see why it varies like I wrote in the startpost.
If you need any other infos, please ask
Thanks,
Nik
|
|
|
|
|
Hi Nik,
I'm sorry to say but that code is rather horrible. I haven't studied it all, however here are first comments:
1.
there is way too much code; from your description, I would estimate the hashing should take no more than 20 lines of code.
2.
please check how often you call GetFiles(). You really need only one of those; calling it more than once wastes a lot of cycles and bytes, and does not guarantee all results will be identical, as your file system is a live object itself.
3.
you really should separate the functionality (calculate hash on a number of files) from other issues, such as the way you want to store things (DataTable).
4.
IIRC you did not mention progress bars before; they are the first suspect for the slowness. You did use Invoke (which you must), however this causes two thread switches (to the GUI thread and back), slowing down everything by a large factor.
This is how I would do this (initial approach):
1. create a little class "Result" holding filename, filesize, hashvalue
2. create a class "Job", holding a List of strings for filenames, a List of Results, and some code.
Necessary code would be:
2a. a Launch method that creates a thread which starts enumerating filenames (from its own input List), and produces results (in its own List of Results)
2b. a Join method that waits for termination, using the Thread.Join method.
3. in the main code:
3a. create N Job instances (assume N is 1 or 2) and keep them in a List of Job
3b. get a collection of all the files, and distribute them over the List of filenames in all the jobs
3c. launch the jobs
3d. wait on all jobs done
3e. go collect the results of all the jobs and post-process them
If you want progrss indication:
- give each Job a filesDone counter;
- add a System.Windows.Forms.Timer with Interval=100 that enumerates all jobs, totalizes their filesDone value, and sets that as the ProgressBar,Value (where ProgressBar.Maximum was set to the number of files returned by the initial GetFiles)
I know, all the above really says is: throw away what you have; start all over, and keep things as clean as possible.
Good luck!
PS: the above "first approach" is not optimal; there are a couple of ways to improve it, but it would be quite good if you tried it like that before going for gold.
|
|
|
|
|
Hey Luc,
thanks, long reply
to be honest, it made me dizzy, right now I have no idea where to start or how to do it ...
I have to look into it and google some more ...
and you're right, there's one getfiles that can be removed because it's unused and one that I can replace by the 2nd one. I didn't notice that yet.
though the files in the folder (and the file count) won't change unless I change them manually.
I didn't think of the progressbar, I thought since I use invoke and due to my missing knowledge that it won't take time away.
So for now I'll start with trying to implement the timer instead of using invoke
Nik
|
|
|
|
|
good.
if you just remove the progress stuff for now, you can judge how much it did affect the overall speed.
then, add proper progress reporting; when done right, it should not influence speed at all.
|
|
|
|
|
NikWing wrote: result = Nothing hashclass.HashFile(aryFi_hashdups(r2).FullName)
myNewRow = dt_db_cmp_fldr_b.NewRow()
myNewRow("filesize") = aryFi_hashdups(r2).Length
myNewRow("sha256hash") = result
I had a closer look at this part of your code; how is result getting its value here?
HashFile is a method that returns "result" but the caller is not storing it at all?
And inside HashFile the variable result is not declared and not initialized. This is all wrong.
|
|
|
|
|
Thanks for your time, Luc
no no, I just didn't paste the code you're missing
the program is about 4200 lines right now so I kinda forgot to paste this:
Public Module Variables_form1
Friend result As String = ""
End Module
so I set result to nothing, call the hashfile function with the full filename, it returns the hash and stores it in a new datatable row
like I said, the original program runs without a flaw, just the double-thread stuff breaks it somehow ...
I gave up for today 10 mins ago.
the timer instead of invoke didn't speed it up.
the progressbar value is between 96% and 100%, usually 99%.
progbar.Maximum value is aryFi_hashdups.count, current value is the sum of each FOR .. NEXT counter, ((r1) + (r2 - aryficnt_b + 1)) where aryficnt_b = Math.Ceiling(aryFi_hashdups.Count / 2)
But beside this, the tables are broken somehow
I thought the problem occurs when I copy each table row by row to a new table
but both have the correct amount of lines, as in, I hashed 838 files, each table contains 419 rows
I skipped making a new table out of table_a and table_b and altered the compare part to use table_a, then table_b.
the problem still occurs, the number of results varies. I tried it with my single thread version, the result won't vary and it's correct.
I now try to find out if the aryFi_hashdups array varies. I can't imagine this, why should it alter itself while the program runs? it's just a list of filenames as far as I know (di.getfiles(*.*), and after it's created nothing changes it ...)
I divide it's count by 2, round one to floor, one to ceiling and have 2 loops
Dim aryficnt_a As Integer = Math.Floor(aryFi_hashdups.Count / 2)
For r1 = 0 To aryficnt_a - 1
Next
and
Dim aryficnt_b As Integer = Math.Ceiling(aryFi_hashdups.Count / 2)
For r2 = aryficnt_b To aryFi_hashdups.Count - 1
Next
I just don't understand, I don't see anything that interferes ...
edit: uh, just thought of something ... could the problem be that I only have one hash function and also just one "result" ?
I'll copy the hash function and change result to result_a and result_b and see if that works ...
I will try that now, though it's already 1:25 am ^^
edit2: I guess that was the problem of the wrong results. now the result won't vary anymore. I also fixed the progressbar value, I have to make more tests with different files, but for now it finishes with 100%
with taskman I monitored the cpu usage. without visible caching (1st run) it uses between 2 and 5% of CPU, no idea why it doesn't use more. the hdd isn't very active, only flashing now and then.
WITH visible caching (2nd run etc) it uses 50% of the CPU and of course is much faster.
I have to compare both versions of the code again, how much time each takes for 1000 files and if anything improved ...
modified on Thursday, May 6, 2010 7:32 PM
|
|
|
|
|
NikWing wrote: Public Module Variables_form1
Friend result As String = "" End Module
OK, that is completely wrong and would explain why some of the hash values are off. You only have one "result" variable, yet two threads are writing to it and reading from it. So the order of operations will be determined by chance, and at some point in time it could be:
thread1 writes 111
thread2 writes 222
thread2 reads, hence gets 222
thread1 reads, hence gets 222 instead of 111
furthermore, while executing, the HashFile function constantly changes "result". So it is almost by accident that some of your hashes end up being correct!
Your HashFile function returns the result, no need to use an external variable at all. Here is what you do:
remove Friend result As String = ""
add a result variable inside HashFile
where you call HashFile, do it like so: Dim result as Long=...HashFile...
so now each thread receives the correct value and stores it locally.
Haven't read most of your message yet. May have more to comment later.
|
|
|
|
|
alright
did it and it still works
(though I had to use String instead of Long because of hex conversation errors)
that's the problem when you (I mean myself) don't know what you're doing.
searching for infos on the WWW might result in something that works, but it's hard to see if it couldn't be way better, safer, faster ...
Since I used another class I couldn't just Dim result like other variables, so I googled and found the way with Friend, I didn't know how to use the Return value
Since I edited the code using what you suggested, do I still need 2 hash functions? for testing it I copied the function and now have hashfile_a and hashfile_b.
Would having just 1 function matter if 2 threads access it? I'm not sure but I would say it doesn't matter?
|
|
|
|
|
NikWing wrote: I had to use String instead of Long
right, I hadn't spotted yet your hash is a string; when I calculate hashes, I use int or long, that works much faster, however SHA256 returns a byte array, so you want to turn that into one thing and chose a string; I still might turn those 16? bytes into one long, but that isn't really relevant.
anyway, a double advice: start studying from a real source, i.e. a book; and then solve your problem with much much much less code. It is complete non-sense to write 4000+ lines of code while not knowing the fundamentals of the language.
|
|
|
|
|
I'll try to
by the way, I just took the operation time again.
1000 files each time, non-identical, 1st run, so no caching.
single thread: 35 seconds
dual-thread: 54 seconds ...
it's ... just ... odd ...
oh well, bed now, it's 3 am.
|
|
|
|
|
It's not that odd when you consider that starting a thread is a very expensive operation by itself. And, judging by the code that has been written and discussed with Luc, it's not written with threading in mind. Just launching a second copy of the same code you wrote for a single threaded operation in another thread does not mean it's correct for multiple thread operation. Much greater care must be taken to make sure you don't introduce resource contention bottlenecks (access collections, I/O operations, ...) and situations where two or more threads are all waiting on each other to give up control, called a deadlock, as well as others.
Chances are your single-threaded code has to be scrapped and completely rewritten if you want to support multiple threaded operation. And even then, that's no guarantee your entire operation is going to run faster than the single threaded version. You also have to look at what you're doing. Keep in mind that your every day disk can't read more than one part of a disk at any one time, so, depending on your software design, only one thread will get data at any one time. Your algorithm could conceivably thrash the disk, moving the heads back and forth between two areas of the disk constantly, trashing the disk cache and spending more time seeking the heads than reading a segment of data.
|
|
|
|
|
Hello Dave
I just read more into asynchronous file reading, which seems to be a good way to prevent context switching etc.
this might be a way to go, though I don't yet see how to implement it since it's using a callback delegate and StateObj.
So IF this is a better/faster way (like it's described here: http://www.xtremevbtalk.com/showthread.php?t=195997[^]) I just have to find out how to get it to make the hash and return it to where it was initiated, so it finally can be added to the datatable ...
|
|
|
|
|
OK, I looked at some more and now I feel sick.
You have over 4000 lines of such code? I suggest you stop right away, buy a book on VB.NET, and study it (you may want to skip chapters on topics that are not of interest yet, e.g. networking, database, etc). After studying say one week, sit down and create your app from scratch. It may sound harsh, it will pay off pretty soon though.
|
|
|
|
|
LOL
well, this function is only one part of my program.
it's mainly for picture management.
one part is a duplicate file finder that 1st sorts out files that exist only once, then compares the left over file size-identical files by their hashes, display the results in a datagridview with the option to delete selected rows.
next part is the problem above. hashing all leftover unique files, comparing them with a hashtable (database), then display the results in a datagridview. if I click on a field it will show the image in an imagebox (resized) then I can delete the found files, select the next hashtable database and compare the leftover files without rehashing (I'm clever, LOL)
another part allows me to add new hashes to the databases
another part allows me to compare the databases with themself or each other, so I can throw out duplicate hashes
(and some other minor parts/functions I need)
and well, I can't complain, it does it's job, saves me a lot of time
and with your help it works a little better now
my problem is, I don't learn that well by books, I usually learn by doing.
for example: just recently I thought it might be faster to make a dataview using the hashtable and then search the hash of a new file in it. I found out it's WAY faster than how I did it before.
Before I had 2 FOR...NEXT loops, compare a row from table_a with every row in table_b. the hashtable grows and grows, so doing it that way takes a lot of time. 400,000,000 comparisons took more than 5 minutes, maybe 10.
now it's so fast it takes like 3 seconds or something
Thank you
|
|
|
|
|
Your hash values not being correct means your code is not OK; you have to fix that first, before you can start working on performance and multi-threading.
BTW: to keep things simple, I would not touch any DataTable/DataSet in the threads, just store the results in an array, or a Dictionary<string filename, int hashValue> ; then after the joins, enumerate and store the results.
|
|
|
|
|
Using multiple threads will be helpful if it's possible for the different threads to simultaneously be performing useful work, generally using different resources. If two or more threads need to read files off the same physical disk, each thread will likely have to wait any time other threads access the disk, so adding additional threads won't help anything.
Note that because the operating system has its own caching and read-ahead logic, it's often difficult to predict how any particular piece of code will perform. I'm sure that in general the system caching boosts performance, but it does make it much harder to optimize code.
|
|
|
|
|
Hello
yes, I noticed that caching thing. if I restart the process it usually hashes a lot faster
but that won't happen if I use new files or different folders, at least it seems so because it as slow as on 1st start
hashing simultaneously was my idea
the files in the folder are pictures, usual folder size is around 200 MB, though it can vary.
I'm not sure but I think it should at least be faster than trying to hash files that are 200 MB each, copying the files doesn't take that much time either.
I know of a (professional) program that can find duplicate images. It allows the user to set how many threads should be used for processing files. It processes files in a list which are added while scanning for these files in folders.
It works really fast, also uses a progressbar and shows div. informations while working.
And the hdd is really active during this.
It just hard to find informations on the WWW that show how to do that
Nik
|
|
|
|
|
I'd like to have a utility to find duplicate files and implement a reasonable backup/archiving approach. One thing I would think might be helpful with large files would be to start by producing a catalog of file sizes. If a file size is unique, one needn't hash anything to know that the file isn't going to match any other. Otherwise, for large files, one could compute a 'quick hash' value by hashing a few 64K chunks of data taken from different areas of the file. It two files have identical quick-hashes, they may or may not be identical, but if a file's quick-hash is unique that's a sure sign that the file is.
|
|
|
|
|
yes, 2 good ideas IMO.
1st one is how I do it, getting filename and filesize into a datatable, sort it by filesize, compare it with itself to find duplicate rows and import them into a new table.
then process that table, hash all files in it (I hash completely since I'm doing that with pictures which aren't bigger than 2-3 MB) and again compare the datatable, removing unique rows (hash column)
works very well
2nd describes what some duplicate file finders do, hashing the 1st bytes of files with same size, if they are identical it'll hash some more bytes until it finds a difference. at least this is what I found out from my research lol
|
|
|
|
|
I need to make a program that generates 15 random numbers as an array and then lists them in a listbox. I also need to be able to display the maximum and minimum in a label. Heres what I've got so far...
Public Class Form1
Dim strNumbers(14) As String
Private Sub btnGenerate_Click(ByVal sender As System.Object, ByVal e As System.EventArgs) Handles btnGenerate.Click
Dim intRandom As New Random()
Dim intLoop As Integer
lstOutcome.Items.Clear()
For intLoop = 0 To strNumbers.Length - 1
lstOutcome.Items.Add(intRandom.Next(1, 100))
Next intLoop
End Sub
Private Sub btnMaximum_Click(ByVal sender As System.Object, ByVal e As System.EventArgs) Handles btnMaximum.Click
Dim intLoop As Integer
Dim intMax As Integer
For intLoop = 0 To strNumbers.Length - 1
intMax = strNumbers(intLoop)
If intMax < strNumbers(intLoop) Then
intMax = strNumbers(intLoop)
End If
Next intLoop
lblMinMax.Text = intMax
End Sub
End Class
I've got the 15 random numbers in the listbox. I just cant figure out why my maximum code isnt working. Any tips would be awesome right now...
|
|
|
|
|
look again right here:
intMax = strNumbers(intLoop)
If intMax < strNumbers(intLoop) Then
|
|
|
|
|
Put intMax initialization outside for loop. BTW, you can use a List instead of array since it has a sort method. Then all you would need is to get first and last element.
|
|
|
|
|
It worked...to do the minimum I should only have to change the sign I guess. Thanks a lot
|
|
|
|
|
I also figured out that the way I generated the random numbers made the code mess up. I ended up having to use
strNumbers(intLoop) = Int((100 - 1 + 1) * Rnd()) + 1 to get the random numbers.
|
|
|
|
|
I'm trying to develop an addin for Outlook 2003 so that when a new calendar item is added, it fires an event that passes along the new calendar item's information to a web service. I already have the web service working, and I can get the addin to load in outlook. What I can't figure out is how to tie the adding of a new calendar event to a function. Any guidance would be appreciated.
|
|
|
|
|