Code to extract plain text from a PDF file

I only get a blank command prompt window. Please check the following log:

'ex4.exe': Loaded 'C:\Users\177741\Documents\Visual Studio 2008\Projects\ex4\Debug\ex4.exe', Symbols loaded.
'ex4.exe': Loaded 'C:\Windows\System32\ntdll.dll'
'ex4.exe': Loaded 'C:\Windows\System32\kernel32.dll'
'ex4.exe': Loaded 'C:\Users\177741\Documents\Visual Studio 2008\Projects\ex4\Debug\zlib1.dll', Binary was not built with debug information.
'ex4.exe': Loaded 'C:\Windows\System32\msvcrt.dll'
'ex4.exe': Loaded 'C:\Windows\winsxs\x86_microsoft.vc90.debugcrt_1fc8b3b9a1e18e3b_9.0.21022.8_none_96748342450f6aa2\msvcr90d.dll'
The program '[4312] ex4.exe: Native' has exited with code 0 (0x0).

Fine code, but it only works for English text.
In French and other languages,
characters with diacritics come out
as numeric codes.
Could you suggest a fix?

unit pdftotext;

{$mode objfpc}{$H+}

//Converted from http://www.codeproject.com/KB/cpp/ExtractPDFText.aspx
//Original author http://www.codeproject.com/Members/NeWi
//Converted by Domingo Alvarez Duarte mingodadATgmailDOTcom
//Original source file pdf.cpp

//This file contains extremely crude pascal source code to extract plain text
//from a PDF file. It is only intended to show some of the basics involved
//in the process and by no means good enough for commercial use.
//But it can be easily modified to suit your purpose. Code is by no means
//warranted to be bug free or suitable for any purpose.

//Adobe has a web site that converts PDF files to text for free,
//so why would you need something like this? Several reasons:

//1) This code is entirely free including for commericcial use. It only
//   requires PAZLIB which is entirely free as well.

//2) This code tries to put tabs into appropriate places in the text,
//   which means that if your PDF file contains mostly one large table,
//   you can easily take the output of this program and directly read it
//   into Excel! Otherwise if you select and copy the text and paste it into
//   Excel there is no way to extract the various columns again.

//This code assumes that the PDF file has text objects compressed
//using FlateDecode (which seems to be standard).

//This code is free. Use it for any purpose.
//The author assumes no liability whatsoever for the use of this code.
//Use it at your own risk!

//PDF file strings (based on PDFReference15_v5.pdf from www.adobe.com):

//BT = Beginning of a text object, ET = end of a text object
//5 Ts = superscript
//-5 Ts = subscript
//Td move to start next line

interface

uses
   Classes;

function pdf2text(pdfFN: string): boolean;
function pdfStream2textStream(mStreamIn, mStreamOut: TMemoryStream): boolean;

implementation

uses
   SysUtils, paszlib;

const
   cStream = 'stream';
   cEndStream = 'endstream';
   cCR      = #13;
   cNL      = #10;
   cCRNL   = cCR + cNL;
   cTab      = #9;
   cBlanks = [' ', cCR, cNL];
   cDigits = ['0'..'9'];
   cDigitsDot = ['0'..'9','.'];

//Find a string in a buffer.
//Returns the zero-based offset of the first occurrence of the
//null-terminated string "search" within the first "buffersize" bytes of
//"buffer", or -1 when it does not occur. The buffer is compared as raw
//bytes (it may contain #0); only "search" has to be null-terminated.
function FindStringInBuffer(buffer, search: PChar; buffersize: integer): integer;
var
   len, pos, i: integer;
begin
   Result := -1;
   if (buffer = nil) or (search = nil) then
      exit;

   len := strlen(search);
   //Only offsets at which the whole pattern still fits inside the buffer are
   //candidates. This includes a match that ends exactly at the buffer end
   //and never reads beyond buffersize bytes.
   for pos := 0 to buffersize - len do
   begin
      i := 0;
      while (i < len) and (buffer[pos + i] = search[i]) do
         Inc(i);
      if i = len then
         exit(pos);
   end;
end;

//Keep this many previous recent characters for back reference:
//#define oldchar 15
const
   cOldChar = 15;

//Convert a recent set of characters into a number if there is one.
//Scans backwards from search[lastCharOffset] to the nearest digit, then
//further back over the run of digits and dots that ends there, and converts
//that run. The decimal separator is always '.', whatever the system locale.
//Returns -1 when no number is found or the run cannot be converted.
function ExtractNumber(search: PChar; lastCharOffset: integer): real;
var
   iStart, iEnd, count: integer;
   buffer: array[0..(cOldChar + 5)] of char;
   fmt: TFormatSettings;
begin
   Result := -1.0;

   iEnd := lastCharOffset;
   while (iEnd > 0) and not (search[iEnd] in cDigits) do
      Dec(iEnd);
   iStart := iEnd;
   while (iStart > 0) and (search[iStart] in cDigitsDot) do
      Dec(iStart);

   //FillChar takes (target, count, value) in that order.
   FillChar(buffer, SizeOf(buffer), 0);

   //Never copy more than the local buffer holds (one byte is kept for #0).
   count := iEnd - iStart;
   if count > High(buffer) then
      count := High(buffer);
   strlcopy(buffer, search + iStart + 1, count);

   if (buffer[0] <> #0) then
   begin
      //PDF numbers always use '.', so do not rely on the locale setting.
      fmt := DefaultFormatSettings;
      fmt.DecimalSeparator := '.';
      Result := StrToFloatDef(buffer, -1, fmt);
   end;
end;

//This method processes an uncompressed Adobe (text) object and extracts text.
//  oStream - receives the extracted text bytes
//  output  - inflated content stream
//  len     - number of valid bytes in output
//Text is taken from literal strings "( ... )" found between the BT and ET
//operators. Escape sequences inside a literal string are decoded: \ddd (one
//to three octal digits) yields the byte with that code, \n \r \t \b \f stand
//for control codes and any other escaped character stands for itself.
//Only bytes in ' '..'~' and #128..#254 are written, so control codes are
//dropped. A CR/LF pair is written after each text object.
procedure ProcessOutput(oStream: TStream; output: PChar; len: integer);
var
   inTextObject, nextLiteral, consumed: boolean;
   rbdepth, j, i: integer;
   octCount, octValue: integer;
   oc:   array[0..cOldChar] of char;
   c:   char;
   num: real;

   //Check if a certain 2 character token just came along (e.g. BT):
   function seen2(search, recent: PChar): boolean;
   begin
      Result := (recent[cOldChar - 3] = search[0]) and
         (recent[cOldChar - 2] = search[1]) and
         (recent[cOldChar - 1] in cBlanks) and
         (recent[cOldChar - 4] in cBlanks);
   end;

   //Write one extracted character, dropping non-printable codes:
   procedure EmitChar(ch: char);
   begin
      if ((ch >= ' ') and (ch <= '~')) or
         ((Byte(ch) >= 128) and (Byte(ch) < 255)) then
         oStream.Write(ch, 1);
   end;

   //Write the character built from a pending \ddd escape, if there is one:
   procedure FlushOctal;
   begin
      if octCount > 0 then
      begin
         //Only the low eight bits count; high-order overflow is ignored.
         EmitChar(Chr(octValue and $FF));
         octCount := 0;
         octValue := 0;
      end;
   end;

begin
   //Are we currently inside a text object?
   inTextObject := False;

   //Is the next character literal (e.g. \\ to get a \ character or \( to get ( ):
   nextLiteral := False;

   //() Bracket nesting level. Text appears inside ()
   rbdepth := 0;

   //State of a \ddd escape that is being collected (octCount = 0: none):
   octCount := 0;
   octValue := 0;

   //Keep previous chars to get extract numbers etc.:
   for j := 0 to cOldChar - 1 do
      oc[j] := ' ';

   for i := 0 to len - 1 do
   begin
      c := output[i];
      if (inTextObject) then
      begin
         //A \ddd escape ends after three digits or at the first character
         //that is not an octal digit; that character is then handled normally.
         consumed := False;
         if octCount > 0 then
         begin
            if (octCount < 3) and (c in ['0'..'7']) then
            begin
               octValue := octValue * 8 + (Ord(c) - Ord('0'));
               Inc(octCount);
               consumed := True;
            end
            else
               FlushOctal;
         end;

         if not consumed then
         begin
            if (rbdepth = 0) and seen2('TD', oc) then
            begin
               //Positioning.
               //See if a new line has to start or just a tab:
               num := ExtractNumber(oc, cOldChar - 5);
               if (num > 1.0) then
                  oStream.Write(cCRNL, 2);
               if (num < 1.0) then
                  oStream.Write(cTab, 1);
            end;
            if (rbdepth = 0) and seen2('ET', oc) then
            begin
               //End of a text object, also go to a new line.
               inTextObject := False;
               oStream.Write(cCRNL, 2);
            end
            else if (c = '(') and (rbdepth = 0) and (not nextLiteral) then
            begin
               //Start outputting text!
               rbdepth := 1;
               //See if a space or tab (>1000) is called for by looking
               //at the number in front of (
               num := ExtractNumber(oc, cOldChar - 1);
               if (num > 0) then
               begin
                  if (num > 1000.0) then
                     oStream.Write(cTab, 1)
                  else if (num > 100.0) then
                     oStream.Write(' ', 1);
               end;
            end
            else if (c = ')') and (rbdepth = 1) and (not nextLiteral) then
            begin
               //Stop outputting text
               rbdepth := 0;
            end
            else if (rbdepth = 1) then
            begin
               if (c = '\') and (not nextLiteral) then
               begin
                  //The next character is part of an escape sequence.
                  nextLiteral := True;
               end
               else if nextLiteral then
               begin
                  nextLiteral := False;
                  case c of
                     '0'..'7':
                        begin
                           //First digit of an octal character code:
                           octValue := Ord(c) - Ord('0');
                           octCount := 1;
                        end;
                     //These escapes denote control codes, which are never
                     //written to the output:
                     'n', 'r', 't', 'b', 'f': ;
                  else
                     //\( \) \\ and anything else: the character itself.
                     EmitChar(c);
                  end;
               end
               else
               begin
                  //Just a normal text character:
                  EmitChar(c);
               end;
            end;
         end;
      end;
      //Store the recent characters for when we have to go back for a number:
      for j := 0 to cOldChar - 2 do
         oc[j] := oc[j + 1];
      oc[cOldChar - 1] := c;
      if not inTextObject then
      begin
         if seen2('BT', oc) then
         begin
            //Start of a text object:
            inTextObject := True;
         end;
      end;
   end;

   //The data may end in the middle of an escape sequence:
   FlushOctal;
end;

//Extract the text of every stream found in a PDF file image.
//  mStreamIn  - complete PDF file contents
//  mStreamOut - receives the extracted text
//Each "stream ... endstream" section is inflated with zlib and handed to
//ProcessOutput. Sections that do not inflate are skipped; this includes data
//that expands to more than ten times its compressed size, because the output
//buffer is sized by that rule of thumb.
//Always returns True.
function pdfStream2textStream(mStreamIn, mStreamOut: TMemoryStream): boolean;
var
   moreStreams: boolean;
   streamStart, streamEnd, nextStreamStart, filelen, outsize, i: integer;
   buffer, output: PChar;
   zstrm: TZstream;
begin
   buffer   := PChar(mStreamIn.Memory);
   filelen := mStreamIn.Size;
   output   := nil;
   outsize := 0;

   //An empty input stream has no memory block to scan:
   moreStreams := (buffer <> nil) and (filelen > 0);
   try
      //Now search the buffer repeatedly for streams of data:
      while moreStreams do
      begin
         //Search for stream, endstream. We ought to first check the filter
         //of the object to make sure it is FlateDecode, but skip that for now!
         streamStart := FindStringInBuffer(buffer, cStream, filelen);
         streamEnd   := FindStringInBuffer(buffer, cEndStream, filelen);
         if (streamStart > 0) and (streamEnd > streamStart) then
         begin
            //Where the search continues afterwards, kept inside the buffer:
            nextStreamStart := streamEnd + Length(cEndStream) + 1;
            if nextStreamStart > filelen then
               nextStreamStart := filelen;

            //Skip to beginning and end of the data stream:
            Inc(streamStart, Length(cStream));

            if (buffer[streamStart] = cCR {0x0d}) and
               (buffer[streamStart + 1] = cNL {0x0a}) then
               Inc(streamStart, 2)
            else if (buffer[streamStart] = cNL {0x0a}) then
               Inc(streamStart);

            if (buffer[streamEnd - 2] = cCR {0x0d}) and
               (buffer[streamEnd - 1] = cNL {0x0a}) then
               Dec(streamEnd, 2)
            else if (buffer[streamEnd - 1] = cNL {0x0a}) then
               Dec(streamEnd);

            //A zero length stream leaves nothing to inflate (and would
            //otherwise produce a negative size below):
            if streamEnd > streamStart then
            begin
               //Assume output will fit into 10 times input buffer:
               i := (streamEnd - streamStart) * 10;
               if i > outsize then
               begin
                  ReAllocMem(output, i);
                  outsize := i;
               end;

               //Now use zlib to inflate. The z_stream record has to start
               //out zeroed; FillChar takes (target, count, value).
               FillChar(zstrm, SizeOf(zstrm), 0);

               zstrm.avail_in   := streamEnd - streamStart + 1;
               zstrm.avail_out := outsize;
               zstrm.next_in   := PByte(buffer + streamStart);
               zstrm.next_out   := PByte(output);

               if inflateInit(zstrm) = Z_OK then
               begin
                  try
                     if inflate(zstrm, Z_FINISH) >= 0 then
                        //Ok, got something, extract the text:
                        ProcessOutput(mStreamOut, output, zstrm.total_out);
                  finally
                     //Release the decompressor state of this stream:
                     inflateEnd(zstrm);
                  end;
               end;
            end;

            Inc(buffer, nextStreamStart);
            Dec(filelen, nextStreamStart);
            moreStreams := filelen > 0;
         end
         else
            moreStreams := False;
      end;
   finally
      FreeMem(output);
   end;
   Result := True;
end;

//Convert the PDF file pdfFN to plain text, written next to it as
//pdfFN + '.txt'. Returns the result of pdfStream2textStream.
//Exceptions raised while loading or saving the files propagate to the
//caller; both memory streams are released in every case.
function pdf2text(pdfFN: string): boolean;
var
   mStreamIn, mStreamOut: TMemoryStream;
begin
   mStreamOut := nil;
   mStreamIn  := TMemoryStream.Create;
   try
      mStreamOut := TMemoryStream.Create;
      //Read the entire file into memory (!):
      mStreamIn.LoadFromFile(pdfFN);
      Result := pdfStream2textStream(mStreamIn, mStreamOut);
      mStreamOut.SaveToFile(pdfFN + '.txt');
   finally
      //Free is safe on nil, so this also covers a failed Create.
      mStreamOut.Free;
      mStreamIn.Free;
   end;
end;

end.

modified on Thursday, July 30, 2009 6:07 AM

unit pdftotext;

{$mode objfpc}{$H+}

//Converted from http://www.codeproject.com/KB/cpp/ExtractPDFText.aspx
//Original author http://www.codeproject.com/Members/NeWi
//Converted by Domingo Alvarez Duarte mingodadATgmailDOTcom
//Original source file pdf.cpp

//This file contains extremely crude pascal source code to extract plain text
//from a PDF file. It is only intended to show some of the basics involved
//in the process and by no means good enough for commercial use.
//But it can be easily modified to suit your purpose. Code is by no means
//warranted to be bug free or suitable for any purpose.

//Adobe has a web site that converts PDF files to text for free,
//so why would you need something like this? Several reasons:

//1) This code is entirely free including for commericcial use. It only
//   requires PAZLIB which is entirely free as well.

//2) This code tries to put tabs into appropriate places in the text,
//   which means that if your PDF file contains mostly one large table,
//   you can easily take the output of this program and directly read it
//   into Excel! Otherwise if you select and copy the text and paste it into
//   Excel there is no way to extract the various columns again.

//This code assumes that the PDF file has text objects compressed
//using FlateDecode (which seems to be standard).

//This code is free. Use it for any purpose.
//The author assumes no liability whatsoever for the use of this code.
//Use it at your own risk!

//PDF file strings (based on PDFReference15_v5.pdf from www.adobe.com):

//BT = Beginning of a text object, ET = end of a text object
//5 Ts = superscript
//-5 Ts = subscript
//Td move to start next line

interface

uses
   Classes;

function pdf2text(pdfFN: string): boolean;
function pdfStream2textStream(mStreamIn, mStreamOut: TMemoryStream): boolean;

implementation

uses
   SysUtils, paszlib;

const
   cStream = 'stream';
   cEndStream = 'endstream';
   cCR      = #13;
   cNL      = #10;
   cCRNL   = cCR + cNL;
   cTab      = #9;
   cBlanks = [' ', cCR, cNL];
   cDigits = ['0'..'9'];
   cDigitsDot = ['0'..'9','.'];
   c64KB = 60 * 1024;

//Find a string in a buffer.
//Returns the zero-based offset of the first occurrence of the
//null-terminated string "search" within the first "buffersize" bytes of
//"buffer", or -1 when it does not occur. The buffer is compared as raw
//bytes (it may contain #0); only "search" has to be null-terminated.
function FindStringInBuffer(buffer, search: PChar; buffersize: integer): integer;
var
   len, pos, i: integer;
begin
   Result := -1;
   if (buffer = nil) or (search = nil) then
      exit;

   len := strlen(search);
   //Only offsets at which the whole pattern still fits inside the buffer are
   //candidates. This includes a match that ends exactly at the buffer end
   //and never reads beyond buffersize bytes.
   for pos := 0 to buffersize - len do
   begin
      i := 0;
      while (i < len) and (buffer[pos + i] = search[i]) do
         Inc(i);
      if i = len then
         exit(pos);
   end;
end;

//Keep this many previous recent characters for back reference:
//#define oldchar 15
const
   cOldChar = 15;

//This method processes an uncompressed Adobe (text) object and extracts text.
//  oStream - receives the extracted text bytes
//  output  - inflated content stream
//  len     - number of valid bytes in output
//Characters inside "( ... )" between the BT and ET operators are copied to
//oStream when they lie in ' '..'~' or #128..#254. From the second "(" of a
//text object onwards a space is written first unless the last copied
//character was a space, and a CR/LF pair is written when ET is seen.
procedure ProcessOutput(oStream: TStream; output: PChar; len: integer);
var
   inTextObject, nextLiteral: boolean;
   rbdepth, iPos, iLastCharPos, iTextFragments: integer;
   c, lastC:   char;

   //Check if a certain 2 character token just came along (e.g. BT).
   //The token is looked for at output[iPos - 4 .. iPos - 3] with a blank
   //behind it and a blank, or the start of the data, in front of it:
   function seen2(search: PChar): boolean;
   begin
      Result := (iPos >= 4) and
         (output[iPos - 4] = search[0]) and
         (output[iPos - 3] = search[1]) and
         (output[iPos - 2] in cBlanks) and
         //A token in the first two bytes has nothing in front of it:
         ((iPos = 4) or (output[iPos - 5] in cBlanks));
   end;

   //Send one character to the output stream:
   procedure writeCharOut(c: char);
   begin
      //Write needs a variable to take the byte from.
      lastC := c;
      oStream.Write(lastC, 1);
   end;

begin
   //Are we currently inside a text object?
   inTextObject := False;

   //Is the next character literal (e.g. \\ to get a \ character or \( to get ( ):
   nextLiteral := False;

   //() Bracket nesting level. Text appears inside ()
   rbdepth := 0;

   //Last char sent to the output
   lastC := #0;
   //Input position of the last character copied to the output
   iLastCharPos := 0;
   //Number of "(" seen in the current text object
   iTextFragments := 0;

   for iPos := 0 to len - 1 do
   begin
      c := output[iPos];
      if (inTextObject) then
      begin
         if (rbdepth = 0) and seen2('ET') then
         begin
            //End of a text object, also go to a new line.
            inTextObject := False;
            writeCharOut(cCR);
            writeCharOut(cNL);
            iTextFragments := 0;
         end
         else if (c = '(') and (rbdepth = 0) and (not nextLiteral) then
         begin
            //Start outputting text!
            rbdepth := 1;
            Inc(iTextFragments);
            //Separate this fragment from the previous one with a space,
            //unless the last copied character already was one:
            if (iTextFragments > 1) then
            begin
               if (output[iLastCharPos] <> ' ') then
                  writeCharOut(' ');
            end;
         end
         else if (c = ')') and (rbdepth = 1) and (not nextLiteral) then
         begin
            //Stop outputting text
            rbdepth := 0;
         end
         else if (rbdepth = 1) then
         begin
            //Just a normal text character:
            if (c = '\') and (not nextLiteral) then
            begin
               //Only print out next character no matter what. Do not interpret.
               nextLiteral := True;
            end
            else
            begin
               nextLiteral := False;
               if ((c >= ' ') and (c <= '~')) or
                  ((Byte(c) >= 128) and (Byte(c) < 255)) then
               begin
                  writeCharOut(c);
                  iLastCharPos := iPos;
               end;
            end;
         end;
      end;
      if not inTextObject then
      begin
         if seen2('BT') then
         begin
            //Start of a text object:
            inTextObject := True;
         end;
      end;
   end;
end;

//Extract the text of every stream found in a PDF file image.
//  mStreamIn  - complete PDF file contents
//  mStreamOut - receives the extracted text
//Each "stream ... endstream" section is inflated with zlib and handed to
//ProcessOutput. The output buffer starts at c64KB bytes and is doubled until
//the inflated data fits. Sections that do not inflate are skipped.
//Always returns True.
function pdfStream2textStream(mStreamIn, mStreamOut: TMemoryStream): boolean;
var
   moreStreams, retry: boolean;
   streamStart, streamEnd, nextStreamStart, filelen, outsize, newoutsize: integer;
   zres: integer;
   buffer, output: PChar;
   zstrm: TZstream;
begin
   buffer   := PChar(mStreamIn.Memory);
   filelen := mStreamIn.Size;
   output   := nil;
   outsize := 0;

   //An empty input stream has no memory block to scan:
   moreStreams := (buffer <> nil) and (filelen > 0);
   try
      //Now search the buffer repeatedly for streams of data:
      while moreStreams do
      begin
         //Search for stream, endstream. We ought to first check the filter
         //of the object to make sure it is FlateDecode, but skip that for now!
         streamStart := FindStringInBuffer(buffer, cStream, filelen);
         streamEnd   := FindStringInBuffer(buffer, cEndStream, filelen);
         if (streamStart > 0) and (streamEnd > streamStart) then
         begin
            //Where the search continues afterwards, kept inside the buffer:
            nextStreamStart := streamEnd + Length(cEndStream) + 1;
            if nextStreamStart > filelen then
               nextStreamStart := filelen;

            //Skip to beginning and end of the data stream:
            Inc(streamStart, Length(cStream));

            if (buffer[streamStart] = cCR {0x0d}) and
               (buffer[streamStart + 1] = cNL {0x0a}) then
               Inc(streamStart, 2)
            else if (buffer[streamStart] = cNL {0x0a}) then
               Inc(streamStart);

            if (buffer[streamEnd - 2] = cCR {0x0d}) and
               (buffer[streamEnd - 1] = cNL {0x0a}) then
               Dec(streamEnd, 2)
            else if (buffer[streamEnd - 1] = cNL {0x0a}) then
               Dec(streamEnd);

            //A zero length stream leaves nothing to inflate:
            if streamEnd > streamStart then
            begin
               // most of the time the streams we will work with
               // aren't bigger than this after inflate
               newoutsize := c64KB;

               // Inside this loop we will adjust the memory needed
               repeat
                  retry := False;
                  if newoutsize > outsize then
                  begin
                     ReAllocMem(output, newoutsize);
                     outsize := newoutsize;
                  end;

                  //The z_stream record has to start out zeroed;
                  //FillChar takes (target, count, value).
                  FillChar(zstrm, SizeOf(zstrm), 0);

                  zstrm.avail_in   := streamEnd - streamStart + 1;
                  zstrm.avail_out := outsize;
                  zstrm.next_in   := PByte(buffer + streamStart);
                  zstrm.next_out   := PByte(output);

                  if inflateInit(zstrm) = Z_OK then
                  begin
                     try
                        zres := inflate(zstrm, Z_FINISH);
                        //With Z_FINISH a full output buffer is reported as
                        //an error code, so test the remaining space rather
                        //than the sign of the result.
                        if (zres <> Z_STREAM_END) and (zstrm.avail_out = 0) and
                           (outsize <= High(integer) div 2) then
                        begin
                           //we haven't allocated enough space,
                           //start again with twice as much
                           newoutsize := outsize * 2;
                           retry := True;
                        end
                        else if (zres >= 0) and (zstrm.total_out > 0) then
                        begin
                           //Ok, got something, extract the text:
                           ProcessOutput(mStreamOut, output, zstrm.total_out);
                        end;
                     finally
                        inflateEnd(zstrm);
                     end;
                  end;
               until not retry;
            end;

            Inc(buffer, nextStreamStart);
            Dec(filelen, nextStreamStart);
            moreStreams := filelen > 0;
         end
         else
            moreStreams := False;
      end;
   finally
      FreeMem(output);
   end;
   Result := True;
end;

//Convert the PDF file pdfFN to plain text, written next to it as
//pdfFN + '.txt'. Returns the result of pdfStream2textStream.
//Exceptions raised while loading or saving the files propagate to the
//caller; both memory streams are released in every case.
function pdf2text(pdfFN: string): boolean;
var
   mStreamIn, mStreamOut: TMemoryStream;
begin
   mStreamOut := nil;
   mStreamIn  := TMemoryStream.Create;
   try
      mStreamOut := TMemoryStream.Create;
      //Read the entire file into memory (!):
      mStreamIn.LoadFromFile(pdfFN);
      Result := pdfStream2textStream(mStreamIn, mStreamOut);
      mStreamOut.SaveToFile(pdfFN + '.txt');
   finally
      //Free is safe on nil, so this also covers a failed Create.
      mStreamOut.Free;
      mStreamIn.Free;
   end;
end;

initialization

end.

I'm trying to use this in a loop, i.e. to convert multiple PDF files one after another, so I tried to use delete[] on the buffer to free the memory after extracting each file.

I'm getting assertion errors

please help

It works great. I even made a Linux version; it was very easy, I just had to replace ZeroMemory with memset. Thanks a lot!

I don't suppose anyone knows of a .Net conversion of this (VB, C#, C++, I'm not picky).

Extract Text from PDF in C# (100% .NET)[^]

Hi NeWi,

I'm creating an compression application using the zlib, i have included the zlib.h in the header files. I'm facing a linker error

"error LNK2019: unresolved external symbol _deflateInit_ referenced in function "protected: static bool __cdecl Compressor::ZlibCompress"

when i call the ret = deflateInit(&zstm, Z_BEST_COMPRESSION)

Helpfull if solution provided ASAP.

Thanks for replying,
Mayur M

hi I had the same problem.
I added zdll.lib and zlib1.dll to "Resource Files" folder of the project workspace sideview (in vc++ 6) then things went better.
I hope this will work for you too.

Well i am a beginner and I want to develop this project. So could you please provide me with a step by step guideline (procedure) of implementation of this code like in which version of C++ do I have to write this code? , How to make an application for this? etc. Please help me out with a procedure ( step by step ) starting right from basic step . Confused | :confused:

Hello. I want to recognize the pages within the pdf document. So either to exract page1, page2,.... lastpage. Or in a way that I can get a plain text like this one:

<newpage>
John said to Sally....
<newpage>
And he also told her...
<newpage>
and the story goes on...
<newpage>
Till the end, no new page after this. Wink | ;-)

Alexander,

I am no PDF guru but have been looking at PDF's heavily over the last few days and might be able to help!
From what I've seen is that in the PDF (depending on formatting I guess?) you have your streams i.e. objects that are text. Then directly after these streams you have more objects that define your page setup. From what I've seen this is as follows. First you have an object that is defined like below (basically number of pages):

225 0 obj
<< /Type /Pages
   /Kids [
    226 0 R
    227 0 R
    228 0 R
    229 0 R
     ]
   /Count 4
>>
endobj

Next you have each page in turn and within this it looks as though you 'call' the previously defined streams (or any other objects) to appear as the CONTENTS on these pages i.e:

'''''Page 1'''''

226 0 obj
<< /Type /Page
   /Parent 225 0 R
   /MediaBox [0 0 595 841 ]
   /Contents [ 
  7 0 R
  9 0 R
  11 0 R
  13 0 R
  15 0 R
  17 0 R
  19 0 R
  21 0 R
  23 0 R
  25 0 R
  27 0 R
  29 0 R
  31 0 R
  33 0 R
  35 0 R
  37 0 R
             ]
   /Annots [ 
  72 0 R
             ]
   /Resources << 
   /ProcSet 1 0 R
   /Font << /F1 2 0 R
            /F2 3 0 R
            /F3 4 0 R
            /F4 5 0 R >>
              >>
>>
endobj

'''''Page 2'''''

227 0 obj
<< /Type /Page
   /Parent 225 0 R
   /MediaBox [0 0 595 841 ]
   /Contents [ 
  7 0 R
  9 0 R
  81 0 R
  83 0 R
  85 0 R
  87 0 R
  89 0 R
  91 0 R
  93 0 R
  95 0 R
  97 0 R
  99 0 R
  101 0 R
             ]
   /Annots [ 
             ]
   /Resources << 
   /ProcSet 1 0 R
   /Font << /F1 2 0 R
            /F2 3 0 R
            /F3 4 0 R
            /F4 5 0 R >>
              >>
>>
endobj

So I imagine you could write some code that looks for the objects on a certain page?? I hope this could help you in some way! Good Luck!

I am doing an important project. Do you have the code that can read a PDF file containing a table and store the table in EXCELL or notepad or .doc

Leon

Thanks for this code. It saved me TONS of time. There is a bug though: the code needs to check for zero-length streams (this seems odd, but I have encountered it). When streamend < streamstart, just set streamstart equal to streamend.

Also, I am noticing that some pdf docs do not decode numbers correctly (while text decodes fine in same doc...)

Thanks again!

Not sure what you mean: the code already has

if (streamstart>0 && streamend>streamstart)<br />
{<br />
...<br />
}<br />

thus if streamend would be smaller than streamstart it wouldn't execute that part anyway?!

Your application saved my work :)

I get TAB (09) instead of the russian letters.
May be I remove "stdafx.h" from code?

modified on Wednesday, May 14, 2008 2:47 AM

I am using VC++ version 6.0. In my project I need to extract text from a PDF page. I downloaded the code and it seems very useful to me, but I am getting this error: unresolved external symbol __endthreadex.
Can anyone help me for solving it.
Advance Thanks...

Hi all,

I hope that someone can solve my problem. Now, I writing the MFC window application.
My application have some data in the textbox. I want to save that textbox's data to somewhere by text file. I don't know how to output the text file.

If someone know, pls help me.

Regards,
tun

First of all, a big thanks to NeWi.
Your piece of code helped me a lot.

The memory leak in pdf.cpp

at line 223:
char* buffer = new char [filelen]; (is never deleted)

bad pointer handling at line 270:
buffer+= streamend + 7;

this means that even if buffer is deleted there will
still be some memory left.

My solution:
char* bufferRoot = new char [filelen]; (is never deleted)
char* buffer = bufferRoot;

and then at the end of the if scope in which the buffer
was created:
delete[] bufferRoot;

Thanks again NeWi

Best regards
Asger-P

The following is a simple change to fit the need to extract text in Chinese.
Note that this version is still not a complete version. please refer to the PDF specification to make the version complete.

=======================================================================
// PDFTest.cpp : Defines the entry point for the console application.
//

#include "stdafx.h"

//This file contains extremely crude C source code to extract plain text
//from a PDF file. It is only intended to show some of the basics involved
//in the process and by no means good enough for commercial use.
//But it can be easily modified to suit your purpose. Code is by no means
//warranted to be bug free or suitable for any purpose.
//
//Adobe has a web site that converts PDF files to text for free,
//so why would you need something like this? Several reasons:
//
//1) This code is entirely free including for commericcial use. It only
// requires ZLIB (from www.zlib.org) which is entirely free as well.
//
//2) This code tries to put tabs into appropriate places in the text,
// which means that if your PDF file contains mostly one large table,
// you can easily take the output of this program and directly read it
// into Excel! Otherwise if you select and copy the text and paste it into
// Excel there is no way to extract the various columns again.
//
//This code assumes that the PDF file has text objects compressed
//using FlateDecode (which seems to be standard).
//
//This code is free. Use it for any purpose.
//The author assumes no liability whatsoever for the use of this code.
//Use it at your own risk!

//PDF file strings (based on PDFReference15_v5.pdf from www.adobe.com):
//
//BT = Beginning of a text object, ET = end of a text object
//5 Ts = superscript
//-5 Ts = subscript
//Td move to start next line

//No precompiled headers, but uncomment if need be:
#include "stdafx.h"

#include <stdio.h>
#include <windows.h>

//YOur project must also include zdll.lib (ZLIB) as a dependency.
//ZLIB can be freely downloaded from the internet, www.zlib.org
//Use 4 byte struct alignment in your project!

#include "zlib.h"

//Find a string in a buffer.
//Returns the zero-based offset of the first occurrence of the
//null-terminated string "search" in the first "buffersize" bytes of "buffer",
//or (size_t)-1 when it does not occur. The buffer is compared as raw bytes,
//so it may contain embedded zero bytes.
size_t FindStringInBuffer (char* buffer, char* search, size_t buffersize)
{
    if (!buffer || !search) return (size_t)-1;

    size_t len = strlen(search);
    //A pattern longer than the buffer cannot occur in it. Checking this
    //first also keeps the unsigned subtraction below from wrapping around.
    if (len > buffersize) return (size_t)-1;

    //Only offsets where the whole pattern still fits are candidates; the last
    //one is the match that ends exactly at the end of the buffer.
    size_t last = buffersize - len;
    for (size_t pos = 0; pos <= last; pos++)
    {
        size_t k = 0;
        while (k < len && buffer[pos + k] == search[k]) k++;
        if (k == len) return pos;
    }
    return (size_t)-1;
}

//Keep this many previous recent characters for back reference:
#define oldchar 15

//Convert a recent set of characters into a number if there is one.
//Otherwise return -1:
float ExtractNumber(const char* search, int lastcharoffset)
{
int i = lastcharoffset;
while (i>0 && search[i]==' ') i--;
while (i>0 && (isdigit(search[i]) || search[i]=='.')) i--;

// Added by XJ Yang:
if(search[i] == '-')
i--;
// Ended of Added by Xj Yang.

float flt=-1.0;
char buffer[oldchar+5]; ZeroMemory(buffer,sizeof(buffer));
strncpy(buffer, search+i+1, lastcharoffset-i);
if (buffer[0] && sscanf(buffer, "%f", &flt))
{
return flt;
}
return -1.0;
}

//Report whether the two-character token "search" (e.g. BT) has just gone by
//in the history buffer "recent": the token must occupy the two slots before
//the newest character, with a space, CR or LF on either side of it.
bool seen2(const char* search, char* recent)
{
    if (recent[oldchar-3] != search[0]) return false;
    if (recent[oldchar-2] != search[1]) return false;

    const char after = recent[oldchar-1];
    if (after != ' ' && after != 0x0d && after != 0x0a) return false;

    const char before = recent[oldchar-4];
    return before == ' ' || before == 0x0d || before == 0x0a;
}

//Combine two hexadecimal digit characters into one byte: the pair 'F', '3'
//is read as the two digits of a hexadecimal number and becomes 0xF3.
//  c1 - high-order digit, c2 - low-order digit
//Digits may be '0'-'9', 'a'-'f' or 'A'-'F'; any other character counts as 0.
unsigned char TwoChars2Hex(unsigned char c1, unsigned char c2)
{
    const unsigned char digits[2] = { c1, c2 };
    unsigned char value = 0;

    for (int k = 0; k < 2; k++)
    {
        const unsigned char d = digits[k];
        unsigned char nibble = 0;

        if (d >= '0' && d <= '9')
            nibble = (unsigned char)(d - '0');
        else if (d >= 'a' && d <= 'f')
            nibble = (unsigned char)(d - 'a' + 10);  //lower case is worth 10..15 as well
        else if (d >= 'A' && d <= 'F')
            nibble = (unsigned char)(d - 'A' + 10);

        value = (unsigned char)(value * 16 + nibble);
    }
    return value;
}

//This method processes an uncompressed Adobe (text) object and extracts text.
//  file   - receives the extracted text bytes
//  output - inflated content stream
//  len    - number of valid bytes in output
//Between the BT and ET operators, text is taken from literal strings
//"( ... )" and from hexadecimal strings "< ... >". In a hexadecimal string
//every pair of digits becomes one byte, other characters (white space in
//particular) are skipped and a single trailing digit is completed with 0.
//"<<" and ">>" delimit dictionaries and are not treated as strings.
void ProcessOutput(FILE* file, char* output, size_t len)
{
    //Are we currently inside a text object?
    bool intextobject = false;

    //Is the next character literal (e.g. \\ to get a \ character or \( to get ( ):
    bool nextliteral = false;

    //() Round bracket nesting level. Text appears inside ()
    int rbdepth = 0;

    //<> Point bracket nesting level. Text appears inside <>
    int pbdepth = 0;

    //First digit of a hexadecimal pair that still waits for its partner,
    //or -1 when no digit is pending:
    int hexpending = -1;

    //Keep previous chars to get extract numbers etc.:
    char oc[oldchar];
    int j=0;
    for (j=0; j<oldchar; j++) oc[j]=' ';

    for (size_t i=0; i<len; i++)
    {
        unsigned char c = output[i];

        if (intextobject)
        {
            if (rbdepth==0 && seen2("TD", oc))
            {
                //Positioning.
                //See if a new line has to start:
                float num = ExtractNumber(oc,oldchar-5);
                if (num > 1.0)
                {
                    fputc(0x0d, file);
                    fputc(0x0a, file);
                }
            }

            if (pbdepth==0 && seen2("Tw", oc))
            {
                //Word spacing: a value between 0 and 1 also starts a new line.
                float num = ExtractNumber(oc,oldchar-5);
                if (num < 1.0 && num > 0)
                {
                    fputc(0x0d, file);
                    fputc(0x0a, file);
                }
            }

            if (rbdepth==0 && seen2("ET", oc))
            {
                //End of a text object, also go to a new line.
                intextobject = false;
                fputc(0x0d, file);
                fputc(0x0a, file);
            }
            else if (c=='(' && rbdepth==0 && !nextliteral)
            {
                //Start outputting text!
                rbdepth=1;
                //See if a space or tab (>1000) is called for by looking
                //at the number in front of (
                int num = (int) ExtractNumber(oc,oldchar-1);
                if (num>0)
                {
                    if (num>1000.0)
                    {
                        fputc('\t', file);
                    }
                    else if (num>100.0)
                    {
                        fputc(' ', file);
                    }
                }
            }
            else if (c==')' && rbdepth==1 && !nextliteral)
            {
                //Stop outputting text
                rbdepth=0;
            }
            else if (rbdepth==1)
            {
                //Just a normal text character:
                if (c=='\\' && !nextliteral)
                {
                    //Only print out next character no matter what. Do not interpret.
                    nextliteral = true;
                }
                else
                {
                    nextliteral = false;
                    if ( ((c>=' ') && (c<='~')) || ((c>=128) && (c<255)) )
                    {
                        fputc(c, file);
                    }
                }
            }
            else if (c=='<' && pbdepth==0 && !nextliteral)
            {
                //"<<" opens a dictionary, not a hexadecimal string. oc still
                //ends with the previous character at this point.
                bool dictionary = (oc[oldchar-1]=='<')
                    || (i+1<len && output[i+1]=='<');
                if (!dictionary)
                {
                    //Start outputting text!
                    pbdepth=1;
                    hexpending = -1;
                    //See if a space or tab (>1000) is called for by looking
                    //at the number in front of <
                    int num = (int) ExtractNumber(oc,oldchar-1);
                    if (num>0)
                    {
                        if (num>1000.0)
                        {
                            fputc('\t', file);
                        }
                        else if (num>100.0)
                        {
                            fputc(' ', file);
                        }
                    }
                }
            }
            else if (c=='>' && pbdepth==1 && !nextliteral)
            {
                //Stop outputting text. With an odd number of digits the last
                //one is completed with an implied 0.
                if (hexpending >= 0)
                {
                    fputc(TwoChars2Hex((unsigned char) hexpending, '0'), file);
                }
                hexpending = -1;
                pbdepth=0;
            }
            else if (pbdepth==1)
            {
                //Inside a hexadecimal string only digits carry data; every
                //two of them form one output byte.
                bool ishex = (c>='0' && c<='9')
                    || (c>='a' && c<='f')
                    || (c>='A' && c<='F');
                if (ishex)
                {
                    if (hexpending < 0)
                    {
                        hexpending = c;
                    }
                    else
                    {
                        fputc(TwoChars2Hex((unsigned char) hexpending, c), file);
                        hexpending = -1;
                    }
                }
            }
        }
        //Store the recent characters for when we have to go back for a number:
        for (j=0; j<oldchar-1; j++) oc[j]=oc[j+1];
        oc[oldchar-1]=c;
        if (!intextobject)
        {
            if (seen2("BT", oc))
            {
                //Start of a text object:
                intextobject = true;
            }
        }
    }
}

int main(int argc, char * argv[])
{
//Discard existing output:
FILE* fileo = fopen("output.txt", "w");
if (fileo) fclose(fileo);
fileo = fopen("output.txt", "a");

//Open the PDF source file:
FILE* filei = fopen("Some.pdf", "rb");

if (filei && fileo)
{
//Get the file length:
int fseekres = fseek(filei,0, SEEK_END); //fseek==0 if ok
long filelen = ftell(filei);
fseekres = fseek(filei,0, SEEK_SET);

//Read ethe ntire file into memory (!):
char* buffer = new char [filelen]; ZeroMemory(buffer, filelen);
size_t actualread = fread(buffer, filelen, 1 ,filei); //must return 1

bool morestreams = true;

//Now search the buffer repeated for streams of data:
while (morestreams)
{
//Search for stream, endstream. We ought to first check the filter
//of the object to make sure it if FlateDecode, but skip that for now!
size_t streamstart = FindStringInBuffer (buffer, "stream", filelen);
size_t streamend = FindStringInBuffer (buffer, "endstream", filelen);
if (streamstart>0 && streamend>streamstart)
{
//Skip to beginning and end of the data stream:
streamstart += 6;

if (buffer[streamstart]==0x0d && buffer[streamstart+1]==0x0a) streamstart+=2;
else if (buffer[streamstart]==0x0a) streamstart++;

if (buffer[streamend-2]==0x0d && buffer[streamend-1]==0x0a) streamend-=2;
else if (buffer[streamend-1]==0x0a) streamend--;

//Assume output will fit into 10 times input buffer:
size_t outsize = (streamend - streamstart)*10;
char* output = new char [outsize]; ZeroMemory(output, outsize);

//Now use zlib to inflate:
z_stream zstrm; ZeroMemory(&zstrm, sizeof(zstrm));

zstrm.avail_in = streamend - streamstart + 1;
zstrm.avail_out = outsize;
zstrm.next_in = (Bytef*)(buffer + streamstart);
zstrm.next_out = (Bytef*)output;

int rsti = inflateInit(&zstrm);
if (rsti == Z_OK)
{
int rst2 = inflate (&zstrm, Z_FINISH);
if (rst2 >= 0)
{
//Ok, got something, extract the text:
size_t totout = zstrm.total_out;
ProcessOutput(fileo, output, totout);
}
}
delete[] output; output=0;
buffer+= streamend + 7;
filelen = filelen - (streamend+7);
}
else
{
morestreams = false;
}
}
fclose(filei);
}
if (fileo) fclose(fileo);
return 0;
}

xjyang

Can we open this project using VS2003? What are the steps?

I just need a .NET component to convert PDF to TXT.

shammie.lk

Hi, I used your application and it was great, superb. But for some PDF files I didn't get any output, because it generates an empty file. I don't know where the problem is; I feel it is in the compression format, but my file uses only the default compression. Can you give me any suggestion?

DreamUth

Thanks, NeWi, for this article. :-D

It is very useful and saved me a lot of time.

However, some of the PDF files I am trying to convert come out as garbage. Interestingly enough, I do find some readable characters like this among the gibberish after the PDF is decompressed by the zlib module, which means the decompression should be at least partially correct.

"URW Software, Copyright 1997 by URW. See the file COPYING (GNU General Public License) for license conditions. As a special exception, permission is granted to include this font program in a Postscript or PDF file that consists of a document that contains text to be displayed or printed using this font, regardless of the conditions or license applying to the document itself.Standard Symbols LCopyright URW Software, Copyright 1997 by URW "

NeWi mentioned that the newer PDF files may be encrypted and that an updated version will be developed. Did anybody get the updated version? Would you mind posting it here or emailing it to me? Many thanks!

I examined those failed PDFs; the trailer looks something like this:
trailer
<< /Size 39 /Root 1 0 R /Info 2 0 R
/ID [(†¾•®ïT*Yý8J´<æ)(†¾•®ïT*Yý8J´<æ)]
>>
startxref
42616
%%EOF

which doesn't specify that it is encrypted. Does that mean it doesn't fall into the category NeWi was mentioning?

Code to extract plain text from a PDF file

Introduction

Why?

Basics

About Code

Using The Code

Future Enhancements

Code Snippets

License

Comments and Discussions