Click here to Skip to main content
65,938 articles
CodeProject is changing. Read more.
Articles
(untagged)

Decode /FlateDecode PDF Stream To Plain Text using ZLib Inflate Function in VB

0.00/5 (No votes)
9 Dec 2008 1  
Do you want to use zLIB Inflate function in VB? - This sample decodes a PDF (stream...endstream) to plain text. Extract plain text from PDF file

Introduction

This example shows how to use the ZLib Inflate & Deflate function in VB6. This example illustrates how to decode a PDF file to extract plain text.

Background 

This is similar to The Code Project article, Code to Extract Plain Text from a PDF File, but this project does not remove any internal PDF text. I leave the processing up to you. I might update this project later.

Using the Code 

Open the PDF file and get all bytes:

ReDim TheBytes(FileLen(filenam$) - 1)

Open filenam$ For Binary Access Read As #1
    Get #1, , TheBytes()
Close #1

'Convert the entire byte array to a string
sStr = StrConv(TheBytes, vbUnicode)

'Search for stream and endstream
lStart = InStr(1, sStr, "stream")
Do While lStart > 0
    lEnd = InStr(lStart, sStr, "endstream")
    If lEnd > 0 Then
        'Get the stream without the text Stream
        sStream = Mid(sStr, lStart + 6, lEnd - lStart - 6)
        
        'check if to remove the crlf after stream
        If Left(sStream, 2) = vbCrLf Then sStream = Mid(sStream, 3)

        'Convert this stream portion back to a byte array
        TheBytes = StrConv(sStream, vbFromUnicode)
        
        'Decode this portion
        Module4.UncompressData TheBytes, xbBufferOut
        
        'Display the results
        txtUncompressed = txtUncompressed & vbCrLf & vbCrLf & vbCrLf & _
					StrConv(xbBufferOut, vbUnicode)
        
        'Search the next stream where we left off
        lStart = InStr(lEnd + 8, sStr, "stream")
    Else: lStart = 0
    End If
Loop 

'This is the code for Module4 
 Option Explicit

 Private Const Z_FINISH As Long = 4

 Public Enum ZLIB_CompressionLevelConstants
    Z_NO_COMPRESSION = 0
    Z_BEST_SPEED = 1
    Z_BEST_COMPRESSION = 9
    Z_DEFAULT_COMPRESSION = (-1)
 End Enum

 Private Type zStream
    next_in As Long
    avail_in As Long
    total_in As Long
    next_out As Long
    avail_out As Long
    total_out As Long
    msg As Long
    state As Long
    zalloc As Long
    zfree As Long
    opaque As Long
    data_type As Long
    adler As Long
    reserved As Long
 End Type

 Private Declare Function ArrPtr Lib "msvbvm60.dll" _
			Alias "VarPtr" (Ptr() As Any) As Long
 Private Declare Sub CopyMemory Lib "kernel32.dll" Alias "RtlMoveMemory" _
	(ByRef Destination As Any, ByRef Source As Any, ByVal Length As Long)
 Private Declare Function deflate Lib "zlib.dll" _
	(vStream As zStream, Optional ByVal vflush As Long = Z_FINISH) As Long
 Private Declare Function deflateEnd Lib "zlib.dll" (vStream As zStream) As Long
 Private Declare Function deflateInit Lib "zlib.dll" Alias "deflateInit_" _
	(strm As zStream, ByVal Level As Long, ByVal version As String, _
	ByVal stream_size As Long) As Long
 Private Declare Function inflate Lib "zlib.dll" _
	(vStream As zStream, Optional ByVal vflush As Long = 1) As Long
 Private Declare Function inflateEnd Lib "zlib.dll" (vStream As zStream) As Long
 Private Declare Function inflateInit Lib "zlib.dll" Alias "inflateInit_" _
	(strm As zStream, ByVal version As String, ByVal stream_size As Long) As Long

 Private msVersion As String
 Private mnChunkSize As Long

Public Property Get ZLIB_ChunkSize() As Long
    If mnChunkSize = 0 Then
        mnChunkSize = &H10000
    End If
    ZLIB_ChunkSize = mnChunkSize
End Property
Public Property Let ZLIB_ChunkSize(ByVal Value As Long)
    mnChunkSize = Value
End Property

Public Property Get ZLIB_Version() As String
    If LenB(msVersion) = 0 Then
        msVersion = "1.1.2.0"
    End If
    ZLIB_Version = msVersion
End Property

Public Property Let ZLIB_Version(ByRef Value As String)
    msVersion = Value
End Property

Public Function CompressData(ByRef vxbInput() As Byte, _
	ByRef vxbOutput() As Byte, Optional vnStart As Long = 0, _
	Optional vnMaxSize As Long = 0, Optional veCompressionLevel _
	As ZLIB_CompressionLevelConstants = Z_DEFAULT_COMPRESSION) As Boolean
Dim tStream As zStream
Dim rc As Long
Dim xbCopy() As Byte
 With tStream
 
    If deflateInit(tStream, veCompressionLevel, ZLIB_Version, Len(tStream)) = 0 Then
        CompressData = True
        
        CopyMemory rc, ByVal ArrPtr(vxbInput), 4
        If rc Then
            CopyMemory .avail_in, ByVal rc + 16, 4
            .avail_in = .avail_in - vnStart
        End If
        If .avail_in > 0 And vnStart < .avail_in Then
            If vnMaxSize <> 0 And vnMaxSize < .avail_in Then
                .avail_in = vnMaxSize
            End If
            .next_in = VarPtr(vxbInput(vnStart))

            CopyMemory rc, ByVal ArrPtr(vxbOutput), 4
            If rc Then
                CopyMemory rc, ByVal rc + 12, 4

                If rc + vnStart = .next_in Then
                    xbCopy = vxbInput
                    .next_in = VarPtr(xbCopy(vnStart))
                ElseIf vnStart Then
                    ReDim vxbOutput(vnStart - 1)
                    CopyMemory vxbOutput(0), vxbInput(0), vnStart - 1
                End If
            Else
                vxbOutput = vxbInput
            End If

            .avail_out = .avail_in + 12
 
            ReDim Preserve vxbOutput(.total_out - 1 + .avail_out + vnStart)
 
            .next_out = VarPtr(vxbOutput(vnStart + .total_out))

            CompressData = deflate(tStream, 4) = 1

            If .total_out Or vnStart Then
                ReDim Preserve vxbOutput(.total_out + vnStart - 1)
            Else
                Erase vxbOutput
            End If
        End If

        deflateEnd tStream
    End If
 End With
End Function

 Public Function UncompressData(ByRef vxbInput() As Byte, _
	ByRef vxbOutput() As Byte, Optional vnStart As Long = 0, _
	Optional vnMaxSize As Long = 0, _
	Optional ByVal vnUncompressedSize As Long = 0) As Boolean
 Dim tStream As zStream
 Dim rc As Long
 
 Dim xbCopy() As Byte
    With tStream
        
        If inflateInit(tStream, ZLIB_Version, Len(tStream)) = 0 Then
            UncompressData = True
            
            CopyMemory rc, ByVal ArrPtr(vxbInput), 4
            If rc Then
                CopyMemory .avail_in, ByVal rc + 16, 4
                .avail_in = .avail_in - vnStart
            End If
            If .avail_in > 0 And vnStart < .avail_in Then
                
                If vnMaxSize <> 0 And vnMaxSize < .avail_in Then
                    .avail_in = vnMaxSize
                End If
                .next_in = VarPtr(vxbInput(vnStart))
    
                CopyMemory rc, ByVal ArrPtr(vxbOutput), 4
                If rc Then
                    CopyMemory rc, ByVal rc + 12, 4
    
                If rc + vnStart = .next_in Then
                    xbCopy = vxbInput
                    .next_in = VarPtr(xbCopy(vnStart))
                ElseIf vnStart Then
                    ReDim xbDataOut(vnStart - 1)
                    CopyMemory vxbOutput(0), vxbInput(0), vnStart - 1
                End If
            ElseIf vnStart Then
                vxbOutput = vxbInput
            End If
    
            If vnUncompressedSize Then
                .avail_out = vnUncompressedSize
            Else
                .avail_out = .avail_in * 2
            End If

            Do
                ReDim Preserve vxbOutput(.total_out - 1 + .avail_out + vnStart)
               
                .next_out = VarPtr(vxbOutput(vnStart + .total_out))
    
                rc = inflate(tStream)
                If rc Then
                    UncompressData = rc > 0
                    Exit Do
                End If
    
             
                .avail_out = ZLIB_ChunkSize
            Loop Until rc = 1
    
            If .total_out Or vnStart Then
                ReDim Preserve vxbOutput(.total_out + vnStart - 1)
            Else
                Erase vxbOutput
            End If
        End If
    
        inflateEnd tStream
    End If
 End With
End Function

History

  • 9th December, 2008: Initial post

I might update this project to show how to process the text.

License

This article has no explicit license attached to it but may contain usage terms in the article text or the download files themselves. If in doubt please contact the author via the discussion board below.

A list of licenses authors might use can be found here