| 
<?php
 
 function decodeAsciiHex ( $input )
 {
 $output = "";
 
 $isOdd = true;
 $isComment = false;
 
 for ( $i = 0, $codeHigh =  -1 ; $i < strlen ( $input ) && $input [ $i ] != '>' ; $i++  )
 {
 $c = $input [ $i ];
 
 if ( $isComment )
 {
 if ( $c == '\r' || $c == '\n' )
 $isComment = false;
 continue;
 }
 
 switch ( $c )
 {
 case '\0' :
 case '\t' :
 case '\r' :
 case '\f' :
 case '\n' :
 case ' ' :
 break;
 case '%' :
 $isComment = true;
 break;
 
 default :
 $code = hexdec ( $c );
 if ( $code === 0 && $c != '0' )
 return "";
 
 if ( $isOdd )
 $codeHigh = $code;
 else
 $output .= chr ( $codeHigh * 16 + $code );
 
 $isOdd =  !$isOdd;
 break;
 }
 }
 
 if ( $input [ $i ] != '>' )
 return "";
 
 if ( $isOdd )
 $output .= chr ( $codeHigh * 16 );
 
 return $output;
 
 
 }
 
 
 function decodeAscii85 ( $input )
 {
 $output = "";
 
 $isComment = false;
 $ords = array ();
 
 for ( $i = 0, $state = 0 ; $i < strlen ( $input ) && $input [ $i ] != '~' ; $i++  )
 {
 $c = $input [ $i ];
 
 if ( $isComment )
 {
 if ( $c == '\r' || $c == '\n' )
 $isComment = false;
 continue;
 }
 
 if ( $c == '\0' || $c == '\t' || $c == '\r' || $c == '\f' || $c == '\n' || $c == ' ' )
 continue;
 if ( $c == '%' )
 {
 $isComment = true;
 continue;
 }
 if ( $c == 'z' && $state === 0 )
 {
 $output .= str_repeat ( chr ( 0 ), 4 );
 continue;
 }
 if ( $c < '!' || $c > 'u' )
 return "";
 
 $code = ord ( $input [ $i ] ) & 0xff;
 $ords [ $state++  ] = $code - ord ( '!' );
 
 if ( $state == 5 )
 {
 $state = 0;
 for ( $sum = 0, $j = 0 ; $j < 5 ; $j++  )
 $sum = $sum * 85 + $ords [ $j ];
 for ( $j = 3 ; $j >= 0 ; $j--  )
 $output .= chr ( $sum >> ( $j * 8 ) );
 }
 }
 if ( $state === 1 )
 return "";
 elseif ( $state > 1 )
 {
 for ( $i = 0, $sum = 0 ; $i < $state ; $i++  )
 $sum += ( $ords [ $i ] + ( $i == $state - 1 ) ) * pow ( 85, 4 - $i );
 for ( $i = 0 ; $i < $state - 1 ; $i++  )
 $ouput .= chr ( $sum >> ( ( 3 - $i ) * 8 ) );
 }
 
 return $output;
 
 
 }
 
 
 function decodeFlate ( $input )
 {
 return @gzuncompress ( $input );
 
 
 }
 
 
 function getObjectOptions ( $object )
 {
 $options = array ();
 if ( preg_match ( "#<<(.*)>>#ismU", $object, $options ) )
 {
 $options = explode ( "/", $options [ 1 ] );
 @array_shift ( $options );
 
 $o = array ();
 for ( $j = 0 ; $j < @count ( $options ) ; $j++  )
 {
 $options [ $j ] = preg_replace ( "#\s+#", " ", trim ( $options [ $j ] ) );
 if ( strpos ( $options [ $j ], " " ) !== false )
 {
 $parts = explode ( " ", $options [ $j ] );
 $o [ $parts [ 0 ] ] = $parts [ 1 ];
 }
 else
 $o [ $options [ $j ] ] = true;
 }
 $options = $o;
 unset ( $o );
 }
 
 return $options;
 
 
 }
 
 
 function getDecodedStream ( $stream, $options )
 {
 $data = "";
 if ( empty ( $options [ "Filter" ] ) )
 $data = $stream;
 else
 {
 $length =  !empty ( $options [ "Length" ] ) ? $options [ "Length" ] : strlen ( $stream );
 $_stream = substr ( $stream, 0, $length );
 
 foreach ( $options as $key => $value )
 {
 if ( $key == "ASCIIHexDecode" )
 $_stream = decodeAsciiHex ( $_stream );
 if ( $key == "ASCII85Decode" )
 $_stream = decodeAscii85 ( $_stream );
 if ( $key == "FlateDecode" )
 $_stream = decodeFlate ( $_stream );
 }
 $data = $_stream;
 }
 return $data;
 
 
 }
 
 
 function getDirtyTexts ( &$texts, $textContainers )
 {
 for ( $j = 0 ; $j < count ( $textContainers ) ; $j++  )
 {
 if ( preg_match_all ( "#\[(.*)\]\s*TJ#ismU", $textContainers [ $j ], $parts ) )
 $texts = array_merge ( $texts, @$parts [ 1 ] );
 elseif ( preg_match_all ( "#Td\s*(\(.*\))\s*Tj#ismU", $textContainers [ $j ], $parts ) )
 $texts = array_merge ( $texts, @$parts [ 1 ] );
 }
 
 
 }
 
 
 function getCharTransformations ( &$transformations, $stream )
 {
 preg_match_all ( "#([0-9]+)\s+beginbfchar(.*)endbfchar#ismU", $stream, $chars, PREG_SET_ORDER );
 preg_match_all ( "#([0-9]+)\s+beginbfrange(.*)endbfrange#ismU", $stream, $ranges, PREG_SET_ORDER );
 
 for ( $j = 0 ; $j < count ( $chars ) ; $j++  )
 {
 $count = $chars [ $j ] [ 1 ];
 $current = explode ( "\n", trim ( $chars [ $j ] [ 2 ] ) );
 for ( $k = 0 ; $k < $count && $k < count ( $current ) ; $k++  )
 {
 if ( preg_match ( "#<([0-9a-f]{2,4})>\s+<([0-9a-f]{4,512})>#is", trim ( $current [ $k ] ), $map ) )
 $transformations [ str_pad ( $map [ 1 ], 4, "0" ) ] = $map [ 2 ];
 }
 }
 for ( $j = 0 ; $j < count ( $ranges ) ; $j++  )
 {
 $count = $ranges [ $j ] [ 1 ];
 $current = explode ( "\n", trim ( $ranges [ $j ] [ 2 ] ) );
 for ( $k = 0 ; $k < $count && $k < count ( $current ) ; $k++  )
 {
 if ( preg_match ( "#<([0-9a-f]{1,4})>\s+<([0-9a-f]{1,4})>\s+<([0-9a-f]{1,4})>#is", trim ( $current [ $k ] ), $map ) )
 {
 $from = hexdec ( $map [ 1 ] );
 $to = hexdec ( $map [ 2 ] );
 $_from = hexdec ( $map [ 3 ] );
 
 for ( $m = $from, $n = 0 ; $m <= $to ; $m++ , $n++  )
 $transformations [ sprintf ( "%04X", $m ) ] = sprintf ( "%04X", $_from + $n );
 }
 elseif ( preg_match ( "#<([0-9a-f]{1,4})>\s+<([0-9a-f]{1,4})>\s+\[(.*)\]#ismU", trim ( $current [ $k ] ), $map ) )
 {
 $from = hexdec ( $map [ 1 ] );
 $to = hexdec ( $map [ 2 ] );
 $parts = preg_split ( "#\s+#", trim ( $map [ 3 ] ) );
 
 for ( $m = $from, $n = 0 ; $m <= $to && $n < count ( $parts ) ; $m++ , $n++  )
 $transformations [ sprintf ( "%04X", $m ) ] = sprintf ( "%04X", hexdec ( $parts [ $n ] ) );
 }
 }
 }
 
 
 }
 
 
 function getTextUsingTransformations ( $texts, $transformations )
 {
 $document = "";
 for ( $i = 0 ; $i < count ( $texts ) ; $i++  )
 {
 $isHex = false;
 $isPlain = false;
 
 $hex = "";
 $plain = "";
 for ( $j = 0 ; $j < strlen ( $texts [ $i ] ) ; $j++  )
 {
 $c = $texts [ $i ] [ $j ];
 switch ( $c )
 {
 case "<" :
 $hex = "";
 $isHex = true;
 break;
 case ">" :
 $hexs = str_split ( $hex, 4 );
 for ( $k = 0 ; $k < count ( $hexs ) ; $k++  )
 {
 $chex = str_pad ( $hexs [ $k ], 4, "0" );
 if ( isset ( $transformations [ $chex ] ) )
 $chex = $transformations [ $chex ];
 $document .= html_entity_decode ( "&#x" . $chex . ";" );
 }
 $isHex = false;
 break;
 case "(" :
 $plain = "";
 $isPlain = true;
 break;
 case ")" :
 $document .= $plain;
 $isPlain = false;
 break;
 case "\\" :
 $c2 = $texts [ $i ] [ $j + 1 ];
 if ( in_array ( $c2, array (
 "\\",
 "(",
 ")"
 ) ) )
 $plain .= $c2;
 elseif ( $c2 == "n" )
 $plain .= '\n';
 elseif ( $c2 == "r" )
 $plain .= '\r';
 elseif ( $c2 == "t" )
 $plain .= '\t';
 elseif ( $c2 == "b" )
 $plain .= '\b';
 elseif ( $c2 == "f" )
 $plain .= '\f';
 elseif ( $c2 >= '0' && $c2 <= '9' )
 {
 $oct = preg_replace ( "#[^0-9]#", "", substr ( $texts [ $i ], $j + 1, 3 ) );
 $j += strlen ( $oct ) - 1;
 $plain .= html_entity_decode ( "&#" . octdec ( $oct ) . ";" );
 }
 $j++ ;
 break;
 
 default :
 if ( $isHex )
 $hex .= $c;
 if ( $isPlain )
 $plain .= $c;
 break;
 }
 }
 $document .= "\n";
 }
 
 return $document;
 
 
 }
 
 
 function pdf2text ( $filename )
 {
 $infile = @file_get_contents ( $filename, FILE_BINARY );
 if ( empty ( $infile ) )
 return "";
 
 $transformations = array ();
 $texts = array ();
 
 preg_match_all ( "#obj(.*)endobj#ismU", $infile, $objects );
 $objects = @$objects [ 1 ];
 
 for ( $i = 0 ; $i < count ( $objects ) ; $i++  )
 {
 $currentObject = $objects [ $i ];
 
 if ( preg_match ( "#stream(.*)endstream#ismU", $currentObject, $stream ) )
 {
 $stream = ltrim ( $stream [ 1 ] );
 
 $options = getObjectOptions ( $currentObject );
 if (  !( empty ( $options [ "Length1" ] ) && empty ( $options [ "Type" ] ) && empty ( $options [ "Subtype" ] ) ) )
 continue;
 
 $data = getDecodedStream ( $stream, $options );
 
 if ( strlen ( $data ) )
 {
 // if (preg_match_all("#BT(.*)ET#ismU", $data, $textContainers)) { // mauvais découpage sur Tel CRLF : CRLF 0143507794 CRLF
 //var_dump($data);
 if ( preg_match_all ( "# /p <</MCID [0-9]{1,2}>> BDC (.*) EMC #ismU", $data, $textContainers ) )
 { // Découpage en lignes
 // var_dump($textContainers);
 $textContainers = preg_replace ( "#BT(.*)ET#ismU", "$1", $textContainers [ 1 ] ); // Suppression des BT ... ET
 for ( $i = 0 ; $i < count ( $textContainers ) ; $i++  )
 {
 if ( ( $pos = strrpos ( $textContainers [ $i ], "TJ" ) ) !== false )
 {
 $search_length = strlen ( "TJ" );
 $textContainers [ $i ] = substr_replace ( $textContainers [ $i ], "WX", $pos, $search_length );
 }
 }
 $textContainers = preg_replace ( "#(.*)TJ(.*)#ismU", "$1 $2", $textContainers ); // Suppression des ... TJ ...
 for ( $i = 0 ; $i < count ( $textContainers ) ; $i++  )
 {
 if ( ( $pos = strrpos ( $textContainers [ $i ], "WX" ) ) !== false )
 {
 $search_length = strlen ( "WX" );
 $textContainers [ $i ] = substr_replace ( $textContainers [ $i ], "TJ", $pos, $search_length );
 }
 }
 // $textContainers = @$textContainers[1];
 // var_dump($textContainers);
 getDirtyTexts ( $texts, $textContainers );
 }
 else
 getCharTransformations ( $transformations, $data );
 print_r ( $transformations ) ;
 }
 }
 }
 
 return getTextUsingTransformations ( $texts, $transformations );
 
 
 }
 ?>
 |