|
I am getting a blank command prompt window. Please check the following debugger log:
'ex4.exe': Loaded 'C:\Users\177741\Documents\Visual Studio 2008\Projects\ex4\Debug\ex4.exe', Symbols loaded.
'ex4.exe': Loaded 'C:\Windows\System32\ntdll.dll'
'ex4.exe': Loaded 'C:\Windows\System32\kernel32.dll'
'ex4.exe': Loaded 'C:\Users\177741\Documents\Visual Studio 2008\Projects\ex4\Debug\zlib1.dll', Binary was not built with debug information.
'ex4.exe': Loaded 'C:\Windows\System32\msvcrt.dll'
'ex4.exe': Loaded 'C:\Windows\winsxs\x86_microsoft.vc90.debugcrt_1fc8b3b9a1e18e3b_9.0.21022.8_none_96748342450f6aa2\msvcr90d.dll'
The program '[4312] ex4.exe: Native' has exited with code 0 (0x0).
|
|
|
|
|
Fine code, and it works for English text,
but for French and other languages
text with diacritic characters comes out
as numeric codes.
Could you suggest a fix?
|
|
|
|
|
unit pdftotext;
{$mode objfpc}{$H+}
//Converted from http://www.codeproject.com/KB/cpp/ExtractPDFText.aspx
//Original author http://www.codeproject.com/Members/NeWi
//Converted by Domingo Alvarez Duarte mingodadATgmailDOTcom
//Original source file pdf.cpp
//This file contains extremely crude pascal source code to extract plain text
//from a PDF file. It is only intended to show some of the basics involved
//in the process and by no means good enough for commercial use.
//But it can be easily modified to suit your purpose. Code is by no means
//warranted to be bug free or suitable for any purpose.
//Adobe has a web site that converts PDF files to text for free,
//so why would you need something like this? Several reasons:
//1) This code is entirely free, including for commercial use. It only
// requires PASZLIB, which is entirely free as well.
//2) This code tries to put tabs into appropriate places in the text,
// which means that if your PDF file contains mostly one large table,
// you can easily take the output of this program and directly read it
// into Excel! Otherwise if you select and copy the text and paste it into
// Excel there is no way to extract the various columns again.
//This code assumes that the PDF file has text objects compressed
//using FlateDecode (which seems to be standard).
//This code is free. Use it for any purpose.
//The author assumes no liability whatsoever for the use of this code.
//Use it at your own risk!
//PDF file strings (based on PDFReference15_v5.pdf from www.adobe.com):
//BT = Beginning of a text object, ET = end of a text object
//5 Ts = superscript
//-5 Ts = subscript
//Td move to start next line
interface
uses
Classes;
function pdf2text(pdfFN: string): boolean;
function pdfStream2textStream(mStreamIn, mStreamOut: TMemoryStream): boolean;
implementation
uses
SysUtils, paszlib;
//String, character and character-set constants shared by the routines below.
const
//Keyword searched for to locate the start of a stream section
cStream = 'stream';
//Keyword searched for to locate the end of a stream section
cEndStream = 'endstream';
cCR = #13;
cNL = #10;
//CR followed by LF; written to the output to start a new line
cCRNL = cCR + cNL;
cTab = #9;
//Characters that seen2 accepts on either side of a two-character token
cBlanks = [' ', cCR, cNL];
//Characters that can end a number scanned by ExtractNumber
cDigits = ['0'..'9'];
//Characters that can be part of a number scanned by ExtractNumber
cDigitsDot = ['0'..'9','.'];
//Find a string in a buffer.
//  buffer     - first byte of the area to scan (need not be #0 terminated)
//  search     - #0 terminated pattern to look for
//  buffersize - number of valid bytes starting at buffer
//Returns the zero-based offset of the first occurrence of search, or -1 when
//the pattern does not lie completely inside the first buffersize bytes.
function FindStringInBuffer(buffer, search: PChar; buffersize: integer): integer;
var
  len, pos, i: integer;
  fnd: boolean;
begin
  Result := -1;
  len := strlen(search);
  //buffersize - len is the last offset at which the whole pattern still fits,
  //so no byte at or beyond buffersize is ever compared. When the pattern is
  //longer than the buffer (or buffersize <= 0) the loop does not run at all.
  for pos := 0 to buffersize - len do
  begin
    fnd := True;
    for i := 0 to len - 1 do
    begin
      if (buffer[pos + i] <> search[i]) then
      begin
        fnd := False;
        break;
      end;
    end;
    if fnd then
      exit(pos);
  end;
end;
//Keep this many previous recent characters for back reference:
//#define oldchar 15
const
  cOldChar = 15;

//Convert a recent set of characters into a number if there is one.
//  search         - window of recent characters (ProcessOutput passes its
//                   cOldChar-sized history buffer)
//  lastCharOffset - index in search at which the backwards scan starts
//Scans backwards from lastCharOffset to the nearest digit, then further back
//over digits and dots, and converts that run. A sign in front of the run is
//not included. Returns -1 when no run is found or it cannot be converted.
function ExtractNumber(search: PChar; lastCharOffset: integer): real;
var
  iStart, iEnd: integer;
  buffer: array[0..(cOldChar + 5)] of char;
begin
  Result := -1.0;
  //Find the last digit of the number:
  iEnd := lastCharOffset;
  while (iEnd > 0) and not (search[iEnd] in cDigits) do
    Dec(iEnd);
  //Walk back to the character in front of the number. iStart may become -1
  //so that a number starting at index 0 keeps its first digit.
  iStart := iEnd;
  while (iStart >= 0) and (search[iStart] in cDigitsDot) do
    Dec(iStart);
  //FillChar takes (target, count, value); zero the whole scratch buffer.
  FillChar(buffer, SizeOf(buffer), 0);
  strlcopy(buffer, search + iStart + 1, iEnd - iStart);
  if (buffer[0] <> #0) then
    Result := StrToFloatDef(buffer, -1);
end;
//This method processes an uncompressed Adobe (text) object and extracts text.
//  oStream - stream that receives the extracted characters
//  output  - buffer with the inflated content to scan
//  len     - number of bytes of output to scan
//The buffer is scanned once, front to back. Characters are only written
//while inside a text object (between a BT and an ET token) and, within that,
//only while inside round brackets ( ).
procedure ProcessOutput(oStream: TStream; output: PChar; len: integer);
var
inTextObject, nextLiteral: boolean;
rbdepth, j, i: integer;
//History of the most recent characters. Only indexes 0..cOldChar-1 are
//written and read by this procedure; the newest character is at cOldChar-1.
oc: array[0..cOldChar] of char;
c: char;
num: real;
//Check if a certain 2 character token just came along (e.g. BT):
//the token has to be at recent[cOldChar-3..cOldChar-2] with a character from
//cBlanks directly before and directly after it.
function seen2(search, recent: PChar): boolean;
begin
Result := (recent[cOldChar - 3] = search[0]) and
(recent[cOldChar - 2] = search[1]) and
(recent[cOldChar - 1] in cBlanks) and
(recent[cOldChar - 4] in cBlanks);
end;
begin
//writeln(output);
//Are we currently inside a text object?
inTextObject := False;
//Is the next character literal (e.g. \\ to get a \ character or \( to get ( ):
nextLiteral := False;
//() Bracket nesting level. Text appears inside ()
rbdepth := 0;
//Keep previous chars to get extract numbers etc.:
for j := 0 to cOldChar - 1 do
oc[j] := ' ';
for i := 0 to len - 1 do
begin
c := output[i];
//Note: at this point oc holds the characters before c; c itself is only
//appended to oc further down, after the text-object handling.
if (inTextObject) then
begin
if (rbdepth = 0) and seen2('TD', oc) then
begin
//Positioning.
//See if a new line has to start or just a tab:
//ExtractNumber returns -1 when it finds no number, which also takes the
//tab branch below.
num := ExtractNumber(oc, cOldChar - 5);
//NOTE(review): cCRNL is a string constant passed as an untyped buffer;
//confirm that the two characters themselves are what gets written.
if (num > 1.0) then
oStream.Write(cCRNL, 2);
if (num < 1.0) then
oStream.Write(cTab, 1);
end;
if (rbdepth = 0) and seen2('ET', oc) then
begin
//End of a text object, also go to a new line.
inTextObject := False;
oStream.Write(cCRNL, 2);
end
else if (c = '(') and (rbdepth = 0) and (not nextLiteral) then
begin
//Start outputting text!
rbdepth := 1;
//See if a space or tab (>1000) is called for by looking
//at the number in front of (
num := ExtractNumber(oc, cOldChar - 1);
if (num > 0) then
begin
if (num > 1000.0) then
oStream.Write(cTab, 1)
else if (num > 100.0) then
oStream.Write(' ', 1);
end;
end
else if (c = ')') and (rbdepth = 1) and (not nextLiteral) then
begin
//Stop outputting text
rbdepth := 0;
end
else if (rbdepth = 1) then
begin
//Just a normal text character:
if (c = '\') and (not nextLiteral) then
begin
//Only print out next character no matter what. Do not interpret.
nextliteral := True;
end
else
begin
nextliteral := False;
//Only characters ' '..'~' and byte values 128..254 are written;
//everything else inside the brackets is dropped.
if ((c >= ' ') and (c <= '~')) or
((Byte(c) >= 128) and (Byte(c) < 255)) then
begin
oStream.Write(c, 1);
end;
end;
end;
end;
//Store the recent characters for when we have to go back for a number:
//shift the history one place to the left and append c at the end.
for j := 0 to cOldChar - 2 do
oc[j] := oc[j + 1];
oc[cOldChar - 1] := c;
if not inTextObject then
begin
//This check runs after c was appended, so BT is recognised as soon as
//the blank that follows it has been read.
if seen2('BT', oc) then
begin
//Start of a text object:
inTextObject := True;
end;
end;
end;
end;
//Scans a complete PDF held in memory for "stream ... endstream" sections,
//inflates each of them with zlib and hands the result to ProcessOutput.
//  mStreamIn  - contents of the PDF file
//  mStreamOut - receives the extracted text
//Always returns True. Sections that cannot be inflated (or whose inflated
//size exceeds ten times the compressed size) are skipped silently.
function pdfStream2textStream(mStreamIn, mStreamOut: TMemoryStream): boolean;
var
  moreStreams: boolean;
  streamStart, streamEnd, nextStreamStart, filelen, outsize, i: integer;
  buffer, output: PChar;
  zstrm: TZstream;
begin
  buffer := PChar(mStreamIn.Memory);
  filelen := mStreamIn.Size;
  output := nil;
  outsize := 0;
  //Nothing to search in an empty buffer:
  moreStreams := filelen > 0;
  try
    //Now search the buffer repeatedly for streams of data:
    while moreStreams do
    begin
      //Search for stream, endstream. We ought to first check the filter
      //of the object to make sure it is FlateDecode, but skip that for now!
      streamStart := FindStringInBuffer(buffer, cStream, filelen);
      streamEnd := FindStringInBuffer(buffer, cEndStream, filelen);
      //Offset at which the next search starts. Computed before streamEnd is
      //adjusted below. Length() is used because the character count is meant.
      nextStreamStart := streamEnd + Length(cEndStream) + 1;
      if (streamStart > 0) and (streamEnd > streamStart) then
      begin
        //Skip to beginning and end of the data stream:
        Inc(streamStart, Length(cStream) {6});
        if (buffer[streamStart] = cCR {0x0d}) and
          (buffer[streamStart + 1] = cNL {0x0a}) then
          Inc(streamStart, 2)
        else if (buffer[streamStart] = cNL {0x0a}) then
          Inc(streamStart);
        if (buffer[streamEnd - 2] = cCR {0x0d}) and
          (buffer[streamEnd - 1] = cNL {0x0a}) then
          Dec(streamEnd, 2)
        else if (buffer[streamEnd - 1] = cNL {0x0a}) then
          Dec(streamEnd);
        //A zero length stream leaves streamEnd at or before streamStart after
        //the adjustments above; there is nothing to inflate then, and the
        //size computations below would go negative.
        if streamEnd > streamStart then
        begin
          //Assume output will fit into 10 times input buffer:
          i := (streamEnd - streamStart) * 10;
          if i > outsize then
          begin
            ReAllocMem(output, i);
            outsize := i;
          end;
          //The output buffer is not cleared: ProcessOutput only reads the
          //zstrm.total_out bytes that inflate has written.
          //Now use zlib to inflate. FillChar takes (target, count, value);
          //the record must be zeroed before inflateInit.
          FillChar(zstrm, SizeOf(zstrm), 0);
          zstrm.avail_in := streamEnd - streamStart + 1;
          zstrm.avail_out := outsize;
          zstrm.next_in := PByte(buffer + streamStart);
          zstrm.next_out := PByte(output);
          if inflateInit(zstrm) = Z_OK then
          begin
            if inflate(zstrm, Z_FINISH) >= 0 then
              //Ok, got something, extract the text:
              ProcessOutput(mStreamOut, output, zstrm.total_out);
            //Release the decompressor state allocated by inflateInit:
            inflateEnd(zstrm);
          end;
        end;
        Inc(buffer, nextStreamStart);
        Dec(filelen, nextStreamStart);
        //Stop once the remaining length is used up, so that the next search
        //is never started with a non-positive size.
        if filelen <= 0 then
          moreStreams := False;
      end
      else
        moreStreams := False;
    end;
  finally
    //Also reached when ProcessOutput or the stream raises an exception:
    FreeMem(output);
  end;
  Result := True;
end;
//Extracts the text of the PDF file pdfFN and saves it next to the source
//file as pdfFN + '.txt'.
//Returns the result of pdfStream2textStream. Exceptions raised while loading
//or saving the file propagate to the caller; both memory streams are freed
//in that case as well.
function pdf2text(pdfFN: string): boolean;
var
  mStreamIn, mStreamOut: TMemoryStream;
begin
  mStreamOut := nil;
  mStreamIn := TMemoryStream.Create;
  try
    mStreamOut := TMemoryStream.Create;
    //Read the entire file into memory (!):
    mStreamIn.LoadFromFile(pdfFN);
    Result := pdfStream2textStream(mStreamIn, mStreamOut);
    mStreamOut.SaveToFile(pdfFN + '.txt');
  finally
    //Free is safe on nil, so this also covers a failed second Create:
    mStreamOut.Free;
    mStreamIn.Free;
  end;
end;
end.
modified on Thursday, July 30, 2009 6:07 AM
|
|
|
|
|
unit pdftotext;
{$mode objfpc}{$H+}
//Converted from http://www.codeproject.com/KB/cpp/ExtractPDFText.aspx
//Original author http://www.codeproject.com/Members/NeWi
//Converted by Domingo Alvarez Duarte mingodadATgmailDOTcom
//Original source file pdf.cpp
//This file contains extremely crude pascal source code to extract plain text
//from a PDF file. It is only intended to show some of the basics involved
//in the process and by no means good enough for commercial use.
//But it can be easily modified to suit your purpose. Code is by no means
//warranted to be bug free or suitable for any purpose.
//Adobe has a web site that converts PDF files to text for free,
//so why would you need something like this? Several reasons:
//1) This code is entirely free, including for commercial use. It only
// requires PASZLIB, which is entirely free as well.
//2) This code tries to put tabs into appropriate places in the text,
// which means that if your PDF file contains mostly one large table,
// you can easily take the output of this program and directly read it
// into Excel! Otherwise if you select and copy the text and paste it into
// Excel there is no way to extract the various columns again.
//This code assumes that the PDF file has text objects compressed
//using FlateDecode (which seems to be standard).
//This code is free. Use it for any purpose.
//The author assumes no liability whatsoever for the use of this code.
//Use it at your own risk!
//PDF file strings (based on PDFReference15_v5.pdf from www.adobe.com):
//BT = Beginning of a text object, ET = end of a text object
//5 Ts = superscript
//-5 Ts = subscript
//Td move to start next line
interface
uses
Classes;
function pdf2text(pdfFN: string): boolean;
function pdfStream2textStream(mStreamIn, mStreamOut: TMemoryStream): boolean;
implementation
uses
SysUtils, paszlib;
//String, character and size constants shared by the routines below.
const
//Keyword searched for to locate the start of a stream section
cStream = 'stream';
//Keyword searched for to locate the end of a stream section
cEndStream = 'endstream';
cCR = #13;
cNL = #10;
cCRNL = cCR + cNL;
cTab = #9;
//Characters that seen2 accepts on either side of a two-character token
cBlanks = [' ', cCR, cNL];
//cCRNL, cTab, cDigits and cDigitsDot are not referenced by any routine of
//this unit.
cDigits = ['0'..'9'];
cDigitsDot = ['0'..'9','.'];
//Initial size of the inflate output buffer in pdfStream2textStream.
//NOTE(review): despite the name the value is 60 * 1024 = 61440 bytes (60 KB).
c64KB = 60 * 1024;
//Find a string in a buffer.
//  buffer     - first byte of the area to scan (need not be #0 terminated)
//  search     - #0 terminated pattern to look for
//  buffersize - number of valid bytes starting at buffer
//Returns the zero-based offset of the first occurrence of search, or -1 when
//the pattern does not lie completely inside the first buffersize bytes.
function FindStringInBuffer(buffer, search: PChar; buffersize: integer): integer;
var
  patLen, lastStart, start, k: integer;
begin
  Result := -1;
  patLen := strlen(search);
  //Last offset at which the whole pattern still fits. Limiting the scan to it
  //keeps every comparison inside the buffer and still finds a match that
  //ends exactly on the last byte.
  lastStart := buffersize - patLen;
  start := 0;
  while start <= lastStart do
  begin
    k := 0;
    while (k < patLen) and (buffer[start + k] = search[k]) do
      Inc(k);
    if k = patLen then
      exit(start);
    Inc(start);
  end;
end;
//Keep this many previous recent characters for back reference:
//#define oldchar 15
//(cOldChar is not referenced by any routine of this unit; seen2 below reads
//the output buffer directly instead of a history array.)
const
cOldChar = 15;
//This method processes an uncompressed Adobe (text) object and extracts text.
//  oStream - stream that receives the extracted characters
//  output  - buffer with the inflated content to scan
//  len     - number of bytes of output to scan
//The buffer is scanned once, front to back. Characters are only written
//while inside a text object (between a BT and an ET token) and, within that,
//only while inside round brackets ( ). A single space is inserted between
//consecutive bracketed fragments of the same text object.
procedure ProcessOutput(oStream: TStream; output: PChar; len: integer);
var
inTextObject, nextLiteral: boolean;
rbdepth, iPos, lineLen, iLastCharPos, iTextFragments: integer;
c, lastC: char;
//num is declared but not used in this version
num: real;
//Check if a certain 2 character token just came along (e.g. BT):
//True when the token is at output[iPos-4..iPos-3] with a character from
//cBlanks at output[iPos-5] and output[iPos-2]. The token is therefore only
//recognised when iPos is two characters past the blank that follows it.
//NOTE(review): a '(' directly after "BT " is passed before inTextObject is
//set; confirm this delay is intended.
function seen2(search: PChar): boolean;
begin
Result := (iPos > 4) and
(output[iPos - 4] = search[0]) and
(output[iPos - 3] = search[1]) and
(output[iPos - 2] in cBlanks) and
(output[iPos - 5] in cBlanks);
end;
//Writes one character to oStream, remembers it in lastC and keeps lineLen
//(characters written since the last cNL) up to date.
procedure writeCharOut(c: char);
begin
lastC := c;
if lastC = cNL then lineLen := 0
else Inc(lineLen);
oStream.Write(lastC, 1);
end;
begin
//writeln(output);
//Are we currently inside a text object?
inTextObject := False;
//Is the next character literal (e.g. \\ to get a \ character or \( to get ( ):
nextLiteral := False;
//() Bracket nesting level. Text appears inside ()
rbdepth := 0;
//Last char sent to the output
lastC := #0;
//Index in output of the last text character that was written
iLastCharPos := 0;
//Number of ( ) fragments seen in the current text object
iTextFragments := 0;
//Current line length sent to output
lineLen := 0;
for iPos := 0 to len - 1 do
begin
c := output[iPos];
if (inTextObject) then
begin
if (rbdepth = 0) and seen2('TD') then
begin
//Positioning.
//See if a new line has to start or just a tab:
//(intentionally empty in this version: TD produces no output)
end;
if (rbdepth = 0) and seen2('ET') then
begin
//End of a text object, also go to a new line.
inTextObject := False;
writeCharOut(cCR);
writeCharOut(cNL);
iTextFragments := 0;
end
else if (c = '(') and (rbdepth = 0) and (not nextLiteral) then
begin
//Start outputting text!
rbdepth := 1;
inc(iTextFragments);
// see if the last sent char is a space
if (iTextFragments > 1) then
begin
(*//when line length bigger than x
if (lineLen > 80) then
begin
writeCharOut(cCR);
writeCharOut(cNL);
end
else*)
//When 2 or more fragments exist, put a space between them
//unless the last text character written was already a space.
if(output[iLastCharPos] <> ' ') then writeCharOut(' ');
end;
end
else if (c = ')') and (rbdepth = 1) and (not nextLiteral) then
begin
//Stop outputting text
rbdepth := 0;
end
else if (rbdepth = 1) then
begin
//Just a normal text character:
if (c = '\') and (not nextLiteral) then
begin
//Only print out next character no matter what. Do not interpret.
nextliteral := True;
end
else
begin
nextliteral := False;
//Only characters ' '..'~' and byte values 128..254 are written;
//everything else inside the brackets is dropped.
if ((c >= ' ') and (c <= '~')) or
((Byte(c) >= 128) and (Byte(c) < 255)) then
begin
writeCharOut(c);
iLastCharPos := iPos;
end;
end;
end;
end;
//Outside a text object only the BT token is looked for:
if not inTextObject then
begin
if seen2('BT') then
begin
//Start of a text object:
inTextObject := True;
end;
end;
end;
end;
//Scans a complete PDF held in memory for "stream ... endstream" sections,
//inflates each of them with zlib and hands the result to ProcessOutput.
//  mStreamIn  - contents of the PDF file
//  mStreamOut - receives the extracted text
//The inflate buffer starts at c64KB bytes and is doubled whenever inflate
//fills it completely. Always returns True; sections that cannot be inflated
//are skipped silently.
function pdfStream2textStream(mStreamIn, mStreamOut: TMemoryStream): boolean;
var
  moreStreams: boolean;
  streamStart, streamEnd, nextStreamStart, filelen, outsize, newoutsize: integer;
  inflateRes: integer;
  buffer, output: PChar;
  zstrm: TZstream;
begin
  buffer := PChar(mStreamIn.Memory);
  filelen := mStreamIn.Size;
  output := nil;
  outsize := 0;
  //Nothing to search in an empty buffer:
  moreStreams := filelen > 0;
  try
    //Now search the buffer repeatedly for streams of data:
    while moreStreams do
    begin
      //Search for stream, endstream. We ought to first check the filter
      //of the object to make sure it is FlateDecode, but skip that for now!
      streamStart := FindStringInBuffer(buffer, cStream, filelen);
      streamEnd := FindStringInBuffer(buffer, cEndStream, filelen);
      //Offset at which the next search starts. Computed before streamEnd is
      //adjusted below. Length() is used because the character count is meant.
      nextStreamStart := streamEnd + Length(cEndStream) + 1;
      if (streamStart > 0) and (streamEnd > streamStart) then
      begin
        //Skip to beginning and end of the data stream:
        Inc(streamStart, Length(cStream) {6});
        if (buffer[streamStart] = cCR {0x0d}) and
          (buffer[streamStart + 1] = cNL {0x0a}) then
          Inc(streamStart, 2)
        else if (buffer[streamStart] = cNL {0x0a}) then
          Inc(streamStart);
        if (buffer[streamEnd - 2] = cCR {0x0d}) and
          (buffer[streamEnd - 1] = cNL {0x0a}) then
          Dec(streamEnd, 2)
        else if (buffer[streamEnd - 1] = cNL {0x0a}) then
          Dec(streamEnd);
        //A zero length stream leaves streamEnd at or before streamStart after
        //the adjustments above; there is nothing to inflate then, and
        //avail_in below would be computed from a negative length.
        if streamEnd > streamStart then
        begin
          // most of the time the streams we will work with
          // aren't bigger than the initial buffer after inflate
          newoutsize := c64KB;
          // Inside this loop we will adjust the memory needed
          while True do
          begin
            if newoutsize > outsize then
            begin
              ReAllocMem(output, newoutsize);
              outsize := newoutsize;
            end;
            //Now use zlib to inflate. FillChar takes (target, count, value);
            //the record must be zeroed before inflateInit.
            FillChar(zstrm, SizeOf(zstrm), 0);
            zstrm.avail_in := streamEnd - streamStart + 1;
            zstrm.avail_out := outsize;
            zstrm.next_in := PByte(buffer + streamStart);
            zstrm.next_out := PByte(output);
            if inflateInit(zstrm) <> Z_OK then
              break;
            inflateRes := inflate(zstrm, Z_FINISH);
            if (inflateRes >= 0) and (zstrm.total_out = outsize) then
            begin
              //The buffer was filled completely, so the data may have been
              //cut off: double the size and inflate this stream again.
              //NOTE(review): if the library reports a full output buffer with
              //a negative code (Z_BUF_ERROR) this branch is never taken;
              //confirm against paszlib.
              newoutsize := outsize * 2;
              inflateEnd(zstrm);
              continue;
            end;
            if (inflateRes >= 0) and (zstrm.total_out > 0) then
              //Ok, got something, extract the text:
              ProcessOutput(mStreamOut, output, zstrm.total_out);
            inflateEnd(zstrm);
            break;
          end;
        end;
        Inc(buffer, nextStreamStart);
        Dec(filelen, nextStreamStart);
        //Stop once the remaining length is used up, so that the next search
        //is never started with a non-positive size.
        if filelen <= 0 then
          moreStreams := False;
      end
      else
        moreStreams := False;
    end;
  finally
    //Also reached when ProcessOutput or the stream raises an exception:
    FreeMem(output);
  end;
  Result := True;
end;
//Extracts the text of the PDF file pdfFN and saves it next to the source
//file as pdfFN + '.txt'.
//Returns the result of pdfStream2textStream. Exceptions raised while loading
//or saving the file propagate to the caller; both memory streams are freed
//in that case as well.
function pdf2text(pdfFN: string): boolean;
var
  mStreamIn, mStreamOut: TMemoryStream;
begin
  mStreamOut := nil;
  mStreamIn := TMemoryStream.Create;
  try
    mStreamOut := TMemoryStream.Create;
    //Read the entire file into memory (!):
    mStreamIn.LoadFromFile(pdfFN);
    Result := pdfStream2textStream(mStreamIn, mStreamOut);
    mStreamOut.SaveToFile(pdfFN + '.txt');
  finally
    //Free is safe on nil, so this also covers a failed second Create:
    mStreamOut.Free;
    mStreamIn.Free;
  end;
end;
initialization
end.
|
|
|
|
|
I'm trying to use this in a loop, i.e. converting multiple PDF files one after another, so I tried to use delete[] on the buffer to free the memory after each file has been extracted.
However, I'm getting assertion errors when I do that.
Please help.
|
|
|
|
|
It works great. I even made a Linux version; it was very easy, I just had to replace ZeroMemory with memset. Thanks a lot!
|
|
|
|
|
I don't suppose anyone knows of a .Net conversion of this (VB, C#, C++, I'm not picky).
|
|
|
|
|
|
Hi NeWi,
I'm creating a compression application using zlib, and I have included zlib.h in the header files. I'm getting this linker error:
"error LNK2019: unresolved external symbol _deflateInit_ referenced in function "protected: static bool __cdecl Compressor::ZlibCompress"
when I call ret = deflateInit(&zstm, Z_BEST_COMPRESSION).
It would be helpful if a solution could be provided as soon as possible.
Thanks for replying,
Mayur M
|
|
|
|
|
hi
I had the same problem.
I added zdll.lib and zlib1.dll to "Resource Files" folder of the project workspace sideview (in vc++ 6) then things went better.
I hope this will work for you too.
|
|
|
|
|
I am a beginner and I want to build this project. Could you please provide a step-by-step guide to using this code, starting from the very basics: for example, which version of C++ do I need, and how do I turn it into an application?
|
|
|
|
|
Hello. I want to recognize the pages within the PDF document: either extract page 1, page 2, ... last page separately, or get plain text with page markers, like this:
<newpage>
John said to Sally....
<newpage>
And he also told her...
<newpage>
and the story goes on...
<newpage>
Till the end, no new page after this.
|
|
|
|
|
Alexander,
I am no PDF guru but have been looking at PDF's heavily over the last few days and might be able to help!
From what I've seen (it may depend on how the PDF was produced), the PDF contains your streams, i.e. the objects that hold the text. Directly after these streams come further objects that define the page setup. First there is an object like the one below, which lists the pages and gives their count:
225 0 obj
<< /Type /Pages
/Kids [
226 0 R
227 0 R
228 0 R
229 0 R
]
/Count 4
>>
endobj
Next you have each page in turn and within this it looks as though you 'call' the previously defined streams (or any other objects) to appear as the CONTENTS on these pages i.e:
'''''Page 1'''''
226 0 obj
<< /Type /Page
/Parent 225 0 R
/MediaBox [0 0 595 841 ]
/Contents [
7 0 R
9 0 R
11 0 R
13 0 R
15 0 R
17 0 R
19 0 R
21 0 R
23 0 R
25 0 R
27 0 R
29 0 R
31 0 R
33 0 R
35 0 R
37 0 R
]
/Annots [
72 0 R
]
/Resources <<
/ProcSet 1 0 R
/Font << /F1 2 0 R
/F2 3 0 R
/F3 4 0 R
/F4 5 0 R >>
>>
>>
endobj
'''''Page 2'''''
227 0 obj
<< /Type /Page
/Parent 225 0 R
/MediaBox [0 0 595 841 ]
/Contents [
7 0 R
9 0 R
81 0 R
83 0 R
85 0 R
87 0 R
89 0 R
91 0 R
93 0 R
95 0 R
97 0 R
99 0 R
101 0 R
]
/Annots [
]
/Resources <<
/ProcSet 1 0 R
/Font << /F1 2 0 R
/F2 3 0 R
/F3 4 0 R
/F4 5 0 R >>
>>
>>
endobj
So I imagine you could write some code that looks up which objects belong to a given page. I hope this helps you in some way. Good luck!
|
|
|
|
|
I am working on an important project. Do you have code that can read a PDF file containing a table and store the table in Excel, Notepad or a .doc file?
Leon
|
|
|
|
|
Thanks for this code. It saved me TONS of time. There is a bug, though: you need to check for zero-length streams (it seems odd, but I have encountered them). When streamend < streamstart, just set streamstart equal to streamend.
Also, I am noticing that in some PDF documents numbers are not decoded correctly (while text in the same document decodes fine).
Thanks again!
|
|
|
|
|
Not sure what you mean: the code already has
if (streamstart>0 && streamend>streamstart)<br />
{<br />
...<br />
}<br />
thus if streamend would be smaller than streamstart it wouldn't execute that part anyway?!
|
|
|
|
|
Your application saved my work.
|
|
|
|
|
I get TAB (09) instead of the russian letters.
May be I remove "stdafx.h" from code?
modified on Wednesday, May 14, 2008 2:47 AM
|
|
|
|
|
I am using VC++ version 6.0. In my project I need to extract text from a PDF page. I downloaded the code and it seems very useful to me, but I am getting the error: unresolved external symbol __endthreadex.
Can anyone help me solve it?
Thanks in advance.
|
|
|
|
|
Hi all,
I hope that someone can solve my problem. I am writing an MFC Windows application.
My application has some data in a text box. I want to save the text box's contents to a text file, but I don't know how to write the text file.
If someone knows, please help me.
Regards,
tun
|
|
|
|
|
First of all, a big thanks to NeWi.
Your piece of code helped me a lot.
The memory leak in pdf.cpp
at line 223:
char* buffer = new char [filelen]; (is never deleted)
bad pointer handling at line 270:
buffer+= streamend + 7;
this means that even if buffer is deleted there will
still be some memory left.
My solution:
char* bufferRoot = new char [filelen]; (is never deleted)
char* buffer = bufferRoot;
and then, at the end of the if scope in which the buffer
was created:
delete[] bufferRoot;
Thanks again NeWi
Best regards
Asger-P
|
|
|
|
|
The following is a simple change to support extracting Chinese text.
Note that this is still not a complete version; please refer to the PDF specification to complete it.
=======================================================================
// PDFTest.cpp : Defines the entry point for the console application.
//
#include "stdafx.h"
//This file contains extremely crude C source code to extract plain text
//from a PDF file. It is only intended to show some of the basics involved
//in the process and by no means good enough for commercial use.
//But it can be easily modified to suit your purpose. Code is by no means
//warranted to be bug free or suitable for any purpose.
//
//Adobe has a web site that converts PDF files to text for free,
//so why would you need something like this? Several reasons:
//
//1) This code is entirely free, including for commercial use. It only
// requires ZLIB (from www.zlib.org), which is entirely free as well.
//
//2) This code tries to put tabs into appropriate places in the text,
// which means that if your PDF file contains mostly one large table,
// you can easily take the output of this program and directly read it
// into Excel! Otherwise if you select and copy the text and paste it into
// Excel there is no way to extract the various columns again.
//
//This code assumes that the PDF file has text objects compressed
//using FlateDecode (which seems to be standard).
//
//This code is free. Use it for any purpose.
//The author assumes no liability whatsoever for the use of this code.
//Use it at your own risk!
//PDF file strings (based on PDFReference15_v5.pdf from www.adobe.com):
//
//BT = Beginning of a text object, ET = end of a text object
//5 Ts = superscript
//-5 Ts = subscript
//Td move to start next line
//No precompiled headers, but uncomment if need be:
#include "stdafx.h"
#include <stdio.h>
#include <windows.h>
//YOur project must also include zdll.lib (ZLIB) as a dependency.
//ZLIB can be freely downloaded from the internet, www.zlib.org
//Use 4 byte struct alignment in your project!
#include "zlib.h"
//Find a string in a buffer.
//  buffer     - first byte of the area to scan (need not be 0-terminated)
//  search     - 0-terminated pattern to look for
//  buffersize - number of valid bytes starting at buffer
//Returns the zero-based offset of the first occurrence of search. When the
//pattern does not lie completely inside the first buffersize bytes the
//result is (size_t)-1, the value the original "return -1" converted to.
size_t FindStringInBuffer (const char* buffer, const char* search, size_t buffersize)
{
    const size_t notfound = (size_t)-1;
    const size_t len = strlen(search);
    //A pattern longer than the buffer can never match; checking this first
    //also keeps the subtraction below from wrapping around.
    if (len > buffersize) return notfound;
    //Last offset at which the whole pattern still fits, so that no byte at
    //or beyond buffersize is ever compared:
    const size_t last = buffersize - len;
    for (size_t pos=0; pos<=last; pos++)
    {
        size_t i = 0;
        while (i<len && buffer[pos+i]==search[i]) i++;
        if (i==len) return pos;
    }
    return notfound;
}
//Keep this many previous recent characters for back reference:
#define oldchar 15
//Convert a recent set of characters into a number if there is one.
//  search         - window of recent characters (ProcessOutput passes its
//                   oldchar-sized history buffer)
//  lastcharoffset - index in search at which the backwards scan starts
//Skips trailing spaces, then walks back over digits, dots and one leading
//minus sign and converts that run.
//Returns -1 when nothing could be converted.
float ExtractNumber(const char* search, int lastcharoffset)
{
    int i = lastcharoffset;
    while (i>0 && search[i]==' ') i--;
    //isdigit needs a value in the unsigned char range; a plain char holding a
    //byte >= 128 may be negative, which is undefined behaviour.
    while (i>0 && (isdigit((unsigned char)search[i]) || search[i]=='.')) i--;
    // Added by XJ Yang:
    if(search[i] == '-')
        i--;
    // Ended of Added by Xj Yang.
    //Zero-initialised, so the copy below is always 0-terminated:
    char buffer[oldchar+5] = {0};
    int count = lastcharoffset - i;
    if (count <= 0) return -1.0f;
    //Never copy more than fits in front of the terminating 0:
    if (count > (int)sizeof(buffer) - 1) count = (int)sizeof(buffer) - 1;
    strncpy(buffer, search+i+1, count);
    float flt = -1.0f;
    //sscanf returns the number of converted fields (EOF on failure), so only
    //exactly 1 means that flt was filled in.
    if (buffer[0] && sscanf(buffer, "%f", &flt) == 1)
    {
        return flt;
    }
    return -1.0f;
}
//Tell whether a two-character token (e.g. BT) has just gone by.
//  search - the token; its first two characters are compared
//  recent - history of recent characters, newest at index oldchar-1
//The token must occupy recent[oldchar-3] and recent[oldchar-2] and have a
//space, CR (0x0d) or LF (0x0a) directly before and directly after it.
bool seen2(const char* search, char* recent)
{
    const char first  = recent[oldchar-3];
    const char second = recent[oldchar-2];
    if (first != search[0] || second != search[1])
    {
        return false;
    }
    const char after = recent[oldchar-1];
    if (after != ' ' && after != 0x0d && after != 0x0a)
    {
        return false;
    }
    const char before = recent[oldchar-4];
    return before == ' ' || before == 0x0d || before == 0x0a;
}
//Returns the value (0-15) of one hexadecimal digit character.
//Characters that are not hexadecimal digits count as 0.
static unsigned char HexDigitValue(unsigned char ch)
{
    if (ch >= '0' && ch <= '9') return (unsigned char)(ch - '0');
    //Letter digits stand for 10..15, in lower case as well as upper case:
    if (ch >= 'a' && ch <= 'f') return (unsigned char)(ch - 'a' + 10);
    if (ch >= 'A' && ch <= 'F') return (unsigned char)(ch - 'A' + 10);
    return 0;
}
//Treats two characters such as 'F', '3' as the two digits of a hexadecimal
//number and returns the corresponding byte, here 0xF3.
//  c1 - high-order digit
//  c2 - low-order digit
//A character that is not a hexadecimal digit contributes 0 to the result.
unsigned char TwoChars2Hex(unsigned char c1, unsigned char c2)
{
    return (unsigned char)(HexDigitValue(c1) * 16 + HexDigitValue(c2));
}
//This method processes an uncompressed Adobe (text) object and extracts text.
//  file   - open file that receives the extracted characters
//  output - buffer with the inflated content to scan
//  len    - number of bytes of output to scan
//The buffer is scanned once, front to back. Characters are only written
//while inside a text object (between a BT and an ET token) and, within that,
//only while inside round brackets ( ) or pointed brackets < >. Inside < >
//two consecutive characters are combined into one byte with TwoChars2Hex.
void ProcessOutput(FILE* file, char* output, size_t len)
{
//Are we currently inside a text object?
bool intextobject = false;
//Is the next character literal (e.g. \\ to get a \ character or \( to get ( ):
bool nextliteral = false;
//() Round bracket nesting level. Text appears inside ()
int rbdepth = 0;
// Added by XJ Yang:
//<> Point bracket nesting level. Text appears inside <>
int pbdepth = 0;
// Ended of added by XJ Yang.
//Keep previous chars to get extract numbers etc.:
//oc is a sliding history; the newest character is at oc[oldchar-1].
char oc[oldchar];
int j=0;
for (j=0; j<oldchar; j++) oc[j]=' ';
for (size_t i=0; i<len; i++)
{
// Changed by XJ Yang:
// char c = output[i];
unsigned char c = output[i];
// End of changed by XJ Yang.
//Note: at this point oc holds the characters before c; c itself is only
//appended to oc further down, after the text-object handling.
if (intextobject)
{
if (rbdepth==0 && seen2("TD", oc))
{
//Positioning.
//See if a new line has to start or just a tab:
float num = ExtractNumber(oc,oldchar-5);
// Changed by XJ Yang:
/*
if (num>1.0)
{
fputc(0x0d, file);
fputc(0x0a, file);
}
if (num<1.0)
{
// Changed by XJ Yang:
// fputc('\t', file);
// End of changed by XJ Yang.
}
*/
// Ended of changed by Xj Yang.
//Only the new-line case is left; no tab is written for TD any more.
if (num > 1.0)
{
fputc(0x0d, file);
fputc(0x0a, file);
}
}
// Added by XJ yang:
//A Tw token with a number strictly between 0 and 1 in front of it also
//starts a new line.
if (pbdepth==0 && seen2("Tw", oc))
{
float num = ExtractNumber(oc,oldchar-5);
if (num < 1.0 && num > 0)
{
fputc(0x0d, file);
fputc(0x0a, file);
}
}
// End of added by XJ Yang.
if (rbdepth==0 && seen2("ET", oc))
{
//End of a text object, also go to a new line.
intextobject = false;
fputc(0x0d, file);
fputc(0x0a, file);
}
else if (c=='(' && rbdepth==0 && !nextliteral)
{
//Start outputting text!
rbdepth=1;
//See if a space or tab (>1000) is called for by looking
//at the number in front of (
int num = (int) ExtractNumber(oc,oldchar-1);
if (num>0)
{
if (num>1000.0)
{
fputc('\t', file);
}
else if (num>100.0)
{
fputc(' ', file);
}
}
}
else if (c==')' && rbdepth==1 && !nextliteral)
{
//Stop outputting text
rbdepth=0;
}
else if (rbdepth==1)
{
//Just a normal text character:
if (c=='\\' && !nextliteral)
{
//Only print out next character no matter what. Do not interpret.
nextliteral = true;
}
else
{
nextliteral = false;
//Only characters ' '..'~' and byte values 128..254 are written;
//everything else inside the brackets is dropped.
if ( ((c>=' ') && (c<='~')) || ((c>=128) && (c<255)) )
{
fputc(c, file);
}
}
}
// Added by XJ Yang:
//The < > branches below are only reached while rbdepth is 0, because the
//rbdepth==1 branch above comes first in this else-if chain.
else if (c=='<' && pbdepth==0 && !nextliteral)
{
//Start outputting text!
pbdepth=1;
//See if a space or tab (>1000) is called for by looking
//at the number in front of <
int num = (int) ExtractNumber(oc,oldchar-1);
if (num>0)
{
if (num>1000.0)
{
fputc('\t', file);
}
else if (num>100.0)
{
fputc(' ', file);
}
}
}
else if (c=='>' && pbdepth==1 && !nextliteral)
{
//Stop outputting text
pbdepth=0;
}
else if (pbdepth==1)
{
//Just a normal text character:
if (c=='\\' && !nextliteral)
{
//Only print out next character no matter what. Do not interpret.
nextliteral = true;
}
else
{
nextliteral = false;
//NOTE(review): output[++i] below is read without comparing i+1 against len,
//and the character consumed that way is not appended to oc. If it is the
//closing '>', pbdepth stays at 1.
if ( ((c>=' ') && (c<='~')) || ((c>=128) && (c<255)) ) // is a Chinese character:
{
unsigned char c2 = output[++i];
unsigned char combinedChar = TwoChars2Hex(c, c2);
fputc(combinedChar, file);
}
}
}
// End of added by Xj Yang.
}
//Store the recent characters for when we have to go back for a number:
//shift the history one place to the left and append c at the end.
for (j=0; j<oldchar-1; j++) oc[j]=oc[j+1];
oc[oldchar-1]=c;
if (!intextobject)
{
//This check runs after c was appended, so BT is recognised as soon as
//the blank that follows it has been read.
if (seen2("BT", oc))
{
//Start of a text object:
intextobject = true;
}
}
}
}
int main(int argc, char * argv[])
{
//Discard existing output:
FILE* fileo = fopen("output.txt", "w");
if (fileo) fclose(fileo);
fileo = fopen("output.txt", "a");
//Open the PDF source file:
FILE* filei = fopen("Some.pdf", "rb");
if (filei && fileo)
{
//Get the file length:
int fseekres = fseek(filei,0, SEEK_END); //fseek==0 if ok
long filelen = ftell(filei);
fseekres = fseek(filei,0, SEEK_SET);
//Read ethe ntire file into memory (!):
char* buffer = new char [filelen]; ZeroMemory(buffer, filelen);
size_t actualread = fread(buffer, filelen, 1 ,filei); //must return 1
bool morestreams = true;
//Now search the buffer repeated for streams of data:
while (morestreams)
{
//Search for stream, endstream. We ought to first check the filter
//of the object to make sure it if FlateDecode, but skip that for now!
size_t streamstart = FindStringInBuffer (buffer, "stream", filelen);
size_t streamend = FindStringInBuffer (buffer, "endstream", filelen);
if (streamstart>0 && streamend>streamstart)
{
//Skip to beginning and end of the data stream:
streamstart += 6;
if (buffer[streamstart]==0x0d && buffer[streamstart+1]==0x0a) streamstart+=2;
else if (buffer[streamstart]==0x0a) streamstart++;
if (buffer[streamend-2]==0x0d && buffer[streamend-1]==0x0a) streamend-=2;
else if (buffer[streamend-1]==0x0a) streamend--;
//Assume output will fit into 10 times input buffer:
size_t outsize = (streamend - streamstart)*10;
char* output = new char [outsize]; ZeroMemory(output, outsize);
//Now use zlib to inflate:
z_stream zstrm; ZeroMemory(&zstrm, sizeof(zstrm));
zstrm.avail_in = streamend - streamstart + 1;
zstrm.avail_out = outsize;
zstrm.next_in = (Bytef*)(buffer + streamstart);
zstrm.next_out = (Bytef*)output;
int rsti = inflateInit(&zstrm);
if (rsti == Z_OK)
{
int rst2 = inflate (&zstrm, Z_FINISH);
if (rst2 >= 0)
{
//Ok, got something, extract the text:
size_t totout = zstrm.total_out;
ProcessOutput(fileo, output, totout);
}
}
delete[] output; output=0;
buffer+= streamend + 7;
filelen = filelen - (streamend+7);
}
else
{
morestreams = false;
}
}
fclose(filei);
}
if (fileo) fclose(fileo);
return 0;
}
xjyang
|
|
|
|
|
Can we open this project by using VS2003. What are the steps?
I just need a .NET component to convert PDF to TXT.
shammie.lk
|
|
|
|
|
Hi, I used your application and it was great. But for some PDF files I didn't get any output; it generates an empty file. I don't know where the problem is. I suspect the compression format, but my file uses only the default compression. Can you give me any suggestions?
DreamUth
|
|
|
|
|
Thank NeWi for this article. It is very useful and save me a lot of time.
However, some of the PDF files I am trying to convert come out as garbage. Interestingly, I do find some readable text like the following among the gibberish after the PDF is decompressed by the zlib module, which means the decompression must be at least partially correct.
"URW Software, Copyright 1997 by URW. See the file COPYING (GNU General Public License) for license conditions. As a special exception, permission is granted to include this font program in a Postscript or PDF file that consists of a document that contains text to be displayed or printed using this font, regardless of the conditions or license applying to the document itself.Standard Symbols LCopyright URW Software, Copyright 1997 by URW "
NeWi mentioned that newer PDF files may be encrypted and that an updated version would be developed. Did anybody get the updated version? Would you mind posting it here or emailing it to me? Many thanks!
I examined those failed PDFs, the trailer is something like this
trailer
<< /Size 39 /Root 1 0 R /Info 2 0 R
/ID [(†¾•®ïT*Yý8J´<æ)(†¾•®ïT*Yý8J´<æ)]
>>
startxref
42616
%%EOF
which doesn't specify that the file is encrypted. Does that mean it doesn't fall into the category NeWi was mentioning?
|
|
|
|
|