Read 354 times | Created 2014-05-20 17:10:07 | Updated 2014-05-21 07:53:55

 

<?php
/*********************************
FILENAME  : doctotxt.class.php
PURPOSE   : Convert document to text
WRITE DATE : 2014-05-21
**********************************/
/**
 * Best-effort plain-text extraction from office documents.
 *
 * Supported extensions: doc, docx, odt, rtf, xlsx, pptx and pdf. The usual
 * entry point is convertToText(), which dispatches on the extension of the
 * path given to the constructor. The individual *_to_text() methods are
 * public and can also be called directly with any path.
 *
 * The zip-based formats need the "zip" and "dom" extensions; Flate-compressed
 * PDF streams need "zlib". None of the extractors is a full parser for its
 * format: they use heuristics and return whatever text they can find.
 */
class DocxConversion{
  /** @var string Path of the document that convertToText() will read. */
  private $filename;

  /**
   * @param string $filePath Path of the document to convert.
   */
  public function __construct($filePath) {
    $this->filename = $filePath;
  }

/************************ shared helpers ******************************/
  /**
   * Decode an HTML/XML character reference (e.g. "&#233;", "&mdash;") to UTF-8.
   *
   * The charset is passed explicitly so the result does not depend on the
   * default_charset ini setting.
   *
   * @param string $entity A single character reference.
   * @return string The decoded character, or the input unchanged when PHP
   *                does not recognise the reference.
   */
  private function entityToUtf8($entity) {
    return html_entity_decode($entity, ENT_QUOTES, 'UTF-8');
  }

  /**
   * Parse an XML string and return it with all markup removed.
   *
   * Entity substitution and network access are deliberately NOT enabled:
   * the XML comes from untrusted documents, and LIBXML_NOENT would open the
   * door to external-entity (XXE) attacks.
   *
   * @param string|false $xml XML source (false/empty is tolerated).
   * @return string Text content, or "" when the XML is missing or malformed.
   */
  private function xmlToText($xml) {
    if (!is_string($xml) || $xml === "") {
      return "";
    }
    $dom = new DOMDocument();
    if (!$dom->loadXML($xml, LIBXML_NONET | LIBXML_NOERROR | LIBXML_NOWARNING)) {
      return "";
    }
    return strip_tags($dom->saveXML());
  }

  /**
   * Extract the text of one XML entry stored inside a zip container.
   *
   * @param string $zipPath   Path of the zip-based document.
   * @param string $entryName Name of the XML entry inside the archive.
   * @return string Text of the entry, or "" when the archive cannot be
   *                opened or the entry does not exist.
   */
  private function zipXmlEntryToText($zipPath, $entryName) {
    $zip = new ZipArchive;
    if ($zip->open($zipPath) !== true) {
      return "";
    }
    $xml = $zip->getFromName($entryName);
    $zip->close();
    return $this->xmlToText($xml);
  }

/************************ doc file ************************************/
  /**
   * Crude text extraction from a legacy binary .doc file.
   *
   * The file is split on carriage returns; any chunk containing a NUL byte
   * is considered binary and dropped, the rest is joined with spaces and
   * reduced to a conservative set of printable characters.
   *
   * @return string Extracted text, or "" when the file cannot be read.
   */
  private function read_doc() {
    $raw = @file_get_contents($this->filename);
    if ($raw === false || $raw === "") {
      return "";
    }
    $outtext = "";
    foreach (explode(chr(0x0D), $raw) as $thisline) {
      if ($thisline === "" || strpos($thisline, chr(0x00)) !== false) {
        continue;
      }
      $outtext .= $thisline." ";
    }
    // Keep letters, digits, whitespace and a few punctuation characters only.
    return preg_replace('#[^a-zA-Z0-9\s,.\-@/_()]#', "", $outtext);
  }

/************************ docx file ***********************************/
  /**
   * Extract the text of word/document.xml from a .docx archive.
   *
   * Table-cell boundaries become spaces and paragraph ends become CRLF
   * before the remaining markup is stripped.
   *
   * @return string|false Extracted text ("" when the archive has no
   *                      word/document.xml), or false when the archive
   *                      cannot be opened.
   */
  private function read_docx(){
    $zip = new ZipArchive;
    if ($zip->open($this->filename) !== true) {
      return false;
    }
    $content = $zip->getFromName("word/document.xml");
    $zip->close();
    if ($content === false) {
      return '';
    }
    $content = str_replace('</w:r></w:p></w:tc><w:tc>', " ", $content);
    $content = str_replace('</w:r></w:p>', "\r\n", $content);
    return strip_tags($content);
  }

/************************ odt file ************************************/
  /**
   * Extract the text of content.xml from an OpenDocument text file.
   *
   * @param string $input_file Path of the .odt file.
   * @return string Extracted text, or "" on any failure.
   */
  function odt_to_text($input_file){
    return $this->zipXmlEntryToText($input_file, "content.xml");
  }

/************************ rtf file ************************************/
  /**
   * Tell whether text inside an RTF group is document text.
   *
   * A group is NOT plain text when it carries the "\*" marker or one of the
   * destination control words whose content is metadata or binary data.
   *
   * @param array|null $s Control words collected for the current group
   *                      (keyword => parameter or true).
   * @return bool True when characters of the group belong to the output.
   */
  function rtf_isPlainText($s) {
    if (!is_array($s)) {
      return true;
    }
    $arrfailAt = array("*", "fonttbl", "colortbl", "datastore", "themedata",
                       "stylesheet", "info", "pict");
    foreach ($arrfailAt as $marker) {
      if (!empty($s[$marker])) {
        return false;
      }
    }
    return true;
  }

  /**
   * Extract plain text from an RTF file.
   *
   * The input is scanned byte by byte. $state holds the control words seen
   * in the current group; "{" saves it on a stack (the new group inherits
   * it) and "}" restores the saved one.
   *
   * @param string $input_file Path of the .rtf file.
   * @return string Extracted text (UTF-8), or "" when the file is empty or
   *                unreadable.
   */
  function rtf_to_text($input_file) {
    $text = @file_get_contents($input_file);
    if ($text === false || $text === "") {
      return "";
    }
    $letters = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";
    $len = strlen($text);
    $output_text = "";
    $state = array();   // control words of the current group
    $saved = array();   // states of the enclosing groups

    for ($i = 0; $i < $len; $i++) {
      $c = $text[$i];

      if ($c === "{") {
        $saved[] = $state;
        continue;
      }
      if ($c === "}") {
        $state = empty($saved) ? array() : array_pop($saved);
        continue;
      }
      // Raw line breaks and NUL/FF bytes carry no meaning in RTF.
      if ($c === "\0" || $c === "\r" || $c === "\f" || $c === "\n") {
        continue;
      }
      if ($c !== "\\") {
        if ($this->rtf_isPlainText($state)) {
          $output_text .= $c;
        }
        continue;
      }

      // ---- backslash: control symbol or control word ----
      $nc = ($i + 1 < $len) ? $text[$i + 1] : "";
      $plain = $this->rtf_isPlainText($state);

      if ($nc === "\\" || $nc === "{" || $nc === "}") {
        // Escaped literal character.
        if ($plain) {
          $output_text .= $nc;
        }
        $i++;
      } elseif ($nc === "~") {
        // Non-breaking space.
        if ($plain) {
          $output_text .= " ";
        }
        $i++;
      } elseif ($nc === "_") {
        // Non-breaking hyphen.
        if ($plain) {
          $output_text .= "-";
        }
        $i++;
      } elseif ($nc === "*") {
        // Marks the group as an ignorable destination.
        $state["*"] = true;
        $i++;
      } elseif ($nc === "'") {
        // \'hh : one character given as two hex digits.
        $hex = substr($text, $i + 2, 2);
        if ($plain && preg_match('#^[0-9A-Fa-f]{2}$#', $hex)) {
          $output_text .= $this->entityToUtf8("&#".hexdec($hex).";");
        }
        $i += 3;
      } elseif ($nc !== "" && strpos($letters, $nc) !== false) {
        // Control word: letters, then an optional (possibly negative) number.
        $pos = $i + 1;
        $wordLen = strspn($text, $letters, $pos);
        $word = strtolower(substr($text, $pos, $wordLen));
        $pos += $wordLen;

        $param = null;
        $sign = ($pos < $len && $text[$pos] === "-") ? 1 : 0;
        $digits = strspn($text, "0123456789", $pos + $sign);
        if ($digits > 0) {
          $param = substr($text, $pos, $sign + $digits);
          $pos += $sign + $digits;
        }
        // A single space terminating a control word belongs to the word.
        if ($pos < $len && $text[$pos] === " ") {
          $pos++;
        }

        $toText = "";
        switch ($word) {
          case "u":
            // \uN : Unicode character; N is a signed 16-bit decimal.
            if ($param !== null) {
              $cp = (int) $param;
              if ($cp < 0) {
                $cp += 65536;
              }
              $toText .= $this->entityToUtf8("&#".$cp.";");
            }
            // Skip the \ucN fallback characters that follow (default 1).
            $ucDelta = isset($state["uc"]) ? (int) $state["uc"] : 1;
            for ($s = 0; $s < $ucDelta && $pos < $len; $s++) {
              if ($text[$pos] === "{" || $text[$pos] === "}") {
                break;
              }
              if ($text[$pos] === "\\" && $pos + 1 < $len && $text[$pos + 1] === "'") {
                $pos += 4;   // a \'hh escape counts as one character
              } else {
                $pos++;
              }
            }
          break;
          // Line feeds, spaces and tabs.
          case "par": case "page": case "column": case "line": case "lbr":
            $toText .= "\n";
          break;
          case "emspace": case "enspace": case "qmspace":
            $toText .= " ";
          break;
          case "tab": $toText .= "\t"; break;
          // Current date and time in place of the corresponding fields.
          case "chdate": $toText .= date("m.d.Y"); break;
          case "chdpl": $toText .= date("l, j F Y"); break;
          case "chdpa": $toText .= date("D, j M Y"); break;
          case "chtime": $toText .= date("H:i:s"); break;
          // Special characters.
          case "emdash": $toText .= $this->entityToUtf8("&mdash;"); break;
          case "endash": $toText .= $this->entityToUtf8("&ndash;"); break;
          case "bullet": $toText .= $this->entityToUtf8("&bull;"); break;
          case "lquote": $toText .= $this->entityToUtf8("&lsquo;"); break;
          case "rquote": $toText .= $this->entityToUtf8("&rsquo;"); break;
          case "ldblquote": $toText .= $this->entityToUtf8("&ldquo;"); break;
          case "rdblquote": $toText .= $this->entityToUtf8("&rdquo;"); break;
          // Everything else is remembered for the current group; a word
          // without a parameter is stored as true.
          default:
            $state[$word] = ($param === null) ? true : $param;
          break;
        }
        if ($this->rtf_isPlainText($state)) {
          $output_text .= $toText;
        }
        // The loop's $i++ then lands on the first unread character.
        $i = $pos - 1;
      } else {
        // Unknown control symbol (or a trailing backslash): skip it.
        $i++;
      }
    }
    return $output_text;
  }

/************************excel sheet **********************************/
  /**
   * Extract the shared strings (cell texts) of an .xlsx workbook.
   *
   * @param string $input_file Path of the .xlsx file.
   * @return string Extracted text, or "" on any failure.
   */
  function xlsx_to_text($input_file){
    return $this->zipXmlEntryToText($input_file, "xl/sharedStrings.xml");
  }

/***********************power point files*****************************/
  /**
   * Extract the text of every slide of a .pptx presentation.
   *
   * Slides are read as ppt/slides/slide1.xml, slide2.xml, ... until the
   * first missing number.
   *
   * @param string $input_file Path of the .pptx file.
   * @return string Concatenated slide texts, or "" on any failure.
   */
  function pptx_to_text($input_file){
    $zip_handle = new ZipArchive;
    $output_text = "";
    if ($zip_handle->open($input_file) !== true) {
      return $output_text;
    }
    for ($slide_number = 1; ; $slide_number++) {
      $xml_datas = $zip_handle->getFromName("ppt/slides/slide".$slide_number.".xml");
      if ($xml_datas === false) {
        break;
      }
      $output_text .= $this->xmlToText($xml_datas);
    }
    $zip_handle->close();
    return $output_text;
  }

/************************ pdf file ************************************/
  /**
   * Decode a PDF ASCIIHexDecode stream.
   *
   * Whitespace is ignored, "%" starts a comment that runs to the end of the
   * line, and ">" is the mandatory end-of-data marker. An odd number of
   * digits is completed with a trailing 0.
   *
   * @param string $input Encoded data including the ">" marker.
   * @return string Decoded bytes, or "" when the data is malformed or the
   *                marker is missing.
   */
  function decodeAsciiHex($input) {
    $output = "";
    $isComment = false;
    $codeHigh = -1;            // pending high nibble, -1 when none
    $len = strlen($input);
    for ($i = 0; $i < $len && $input[$i] != '>'; $i++) {
      $c = $input[$i];
      if ($isComment) {
        if ($c === "\r" || $c === "\n") {
          $isComment = false;
        }
        continue;
      }
      if (strpos("\0\t\r\f\n ", $c) !== false) {
        continue;
      }
      if ($c === '%') {
        $isComment = true;
        continue;
      }
      if (strpos("0123456789abcdefABCDEF", $c) === false) {
        return "";
      }
      $code = hexdec($c);
      if ($codeHigh < 0) {
        $codeHigh = $code;
      } else {
        $output .= chr($codeHigh * 16 + $code);
        $codeHigh = -1;
      }
    }
    if ($i >= $len) {
      return "";               // no end-of-data marker
    }
    if ($codeHigh >= 0) {
      $output .= chr($codeHigh * 16);
    }
    return $output;
  }

  /**
   * Decode a PDF ASCII85Decode stream.
   *
   * Decoding stops at "~" (start of the "~>" end marker) or at the end of
   * the input. Whitespace is ignored and "z" stands for four zero bytes.
   * Note that "%" is an ordinary base-85 digit, not a comment start.
   *
   * @param string $input Encoded data.
   * @return string Decoded bytes, or "" when the data is malformed.
   */
  function decodeAscii85($input) {
    $output = "";
    $ords = array();
    $state = 0;                // digits collected for the current group
    $len = strlen($input);
    for ($i = 0; $i < $len && $input[$i] != '~'; $i++) {
      $c = $input[$i];
      if (strpos("\0\t\r\f\n ", $c) !== false) {
        continue;
      }
      if ($c === 'z' && $state === 0) {
        $output .= str_repeat(chr(0), 4);
        continue;
      }
      $code = ord($c);
      if ($code < ord('!') || $code > ord('u')) {
        return "";
      }
      $ords[$state++] = $code - ord('!');
      if ($state == 5) {
        $state = 0;
        for ($sum = 0, $j = 0; $j < 5; $j++) {
          $sum = $sum * 85 + $ords[$j];
        }
        for ($j = 3; $j >= 0; $j--) {
          $output .= chr(($sum >> ($j * 8)) & 0xFF);
        }
      }
    }
    if ($state === 1) {
      return "";               // a lone digit cannot encode any byte
    }
    if ($state > 1) {
      // Partial final group: pad with the highest digit, keep $state-1 bytes.
      for ($j = $state; $j < 5; $j++) {
        $ords[$j] = 84;
      }
      for ($sum = 0, $j = 0; $j < 5; $j++) {
        $sum = $sum * 85 + $ords[$j];
      }
      for ($j = 0; $j < $state - 1; $j++) {
        $output .= chr(($sum >> ((3 - $j) * 8)) & 0xFF);
      }
    }
    return $output;
  }

  /**
   * Inflate a zlib-compressed (FlateDecode) PDF stream.
   *
   * @param string $input Compressed data.
   * @return string|false Decompressed data, or false when the data is
   *                      corrupt or the zlib extension is unavailable.
   */
  function decodeFlate($input) {
    if (!function_exists('gzuncompress')) {
      return false;
    }
    // Corrupt streams are expected in the wild; the warning is suppressed
    // on purpose and the false return value reports the failure.
    return @gzuncompress($input);
  }

  /**
   * Read the first "<< ... >>" dictionary of a PDF object into an array.
   *
   * The dictionary is split on "/", so a name used as a value (such as
   * "/Filter /FlateDecode") yields two keys, both set to true. A key
   * followed by text gets the first token as its value, except for an
   * indirect reference ("5 0 R"), which is kept whole so that it cannot be
   * mistaken for a number.
   *
   * @param string $object Raw text of a PDF object.
   * @return array Map of key => value|true; empty when no dictionary exists.
   */
  function getObjectOptions($object) {
    $options = array();
    if (!preg_match("#<<(.*)>>#ismU", $object, $match)) {
      return $options;
    }
    $entries = explode("/", $match[1]);
    array_shift($entries);     // text before the first "/" is not a key
    foreach ($entries as $entry) {
      $entry = preg_replace('#\s+#', " ", trim($entry));
      if ($entry === "") {
        continue;
      }
      if (strpos($entry, " ") === false) {
        $options[$entry] = true;
        continue;
      }
      $parts = explode(" ", $entry);
      if (preg_match('#^\S+ (\d+ \d+ R)$#', $entry, $ref)) {
        $options[$parts[0]] = $ref[1];
      } else {
        $options[$parts[0]] = $parts[1];
      }
    }
    return $options;
  }

  /**
   * Apply the filters named in an object's dictionary to its stream.
   *
   * Filters are recognised by their key (ASCIIHexDecode, ASCII85Decode,
   * FlateDecode) and applied in the order the keys appear in $options.
   * A numeric /Length trims the stream first; a missing or indirect
   * /Length leaves it untouched.
   *
   * @param string $stream  Raw stream bytes.
   * @param array  $options Dictionary as returned by getObjectOptions().
   * @return string Decoded data; the raw stream when no filter is declared;
   *                "" when decoding fails.
   */
  function getDecodedStream($stream, $options) {
    if (empty($options["Filter"])) {
      return $stream;
    }
    $length = strlen($stream);
    if (isset($options["Length"])
        && (is_int($options["Length"]) || is_string($options["Length"]))
        && preg_match('#^\d+$#', (string) $options["Length"])
        && (int) $options["Length"] > 0) {
      $length = (int) $options["Length"];
    }
    $_stream = substr($stream, 0, $length);
    foreach ($options as $key => $value) {
      if ($key === "ASCIIHexDecode") {
        $_stream = $this->decodeAsciiHex($_stream);
      } elseif ($key === "ASCII85Decode") {
        $_stream = $this->decodeAscii85($_stream);
      } elseif ($key === "FlateDecode") {
        $_stream = $this->decodeFlate($_stream);
      }
      if (!is_string($_stream)) {
        return "";             // a decoder gave up
      }
    }
    return $_stream;
  }

  /**
   * Collect the raw operands of the text-showing operators.
   *
   * For each BT...ET container, the contents of every "[...] TJ" array are
   * collected; if the container has none, the "(...)" operands of
   * "Td (...) Tj" sequences are collected instead.
   *
   * @param array $texts          Receives the collected operands (appended).
   * @param array $textContainers Contents of BT...ET blocks.
   * @return void
   */
  function getDirtyTexts(&$texts, $textContainers) {
    if (!is_array($textContainers)) {
      return;
    }
    foreach ($textContainers as $container) {
      if (preg_match_all('#\[(.*)\]\s*TJ#ismU', $container, $parts)) {
        $texts = array_merge($texts, $parts[1]);
      } elseif (preg_match_all('#Td\s*(\(.*\))\s*Tj#ismU', $container, $parts)) {
        $texts = array_merge($texts, $parts[1]);
      }
    }
  }

  /**
   * Collect character-code mappings from bfchar/bfrange sections.
   *
   * Keys are 4-digit upper-case hex codes (shorter codes are zero-padded on
   * the left); values are upper-case hex strings of one or more 4-digit
   * units.
   *
   * @param array  $transformations Receives the mappings (added/overwritten).
   * @param string $stream          Decoded stream that may hold a CMap.
   * @return void
   */
  function getCharTransformations(&$transformations, $stream) {
    preg_match_all('#([0-9]+)\s+beginbfchar(.*)endbfchar#ismU', $stream, $chars, PREG_SET_ORDER);
    preg_match_all('#([0-9]+)\s+beginbfrange(.*)endbfrange#ismU', $stream, $ranges, PREG_SET_ORDER);

    foreach ((array) $chars as $section) {
      $count = (int) $section[1];
      $current = preg_split('#\r\n|\r|\n#', trim($section[2]));
      for ($k = 0; $k < $count && $k < count($current); $k++) {
        if (preg_match('#<([0-9a-f]{2,4})>\s+<([0-9a-f]{4,512})>#is', trim($current[$k]), $map)) {
          $key = strtoupper(str_pad($map[1], 4, "0", STR_PAD_LEFT));
          $transformations[$key] = strtoupper($map[2]);
        }
      }
    }

    foreach ((array) $ranges as $section) {
      $count = (int) $section[1];
      $current = preg_split('#\r\n|\r|\n#', trim($section[2]));
      for ($k = 0; $k < $count && $k < count($current); $k++) {
        $line = trim($current[$k]);
        if (preg_match('#<([0-9a-f]{4})>\s+<([0-9a-f]{4})>\s+<([0-9a-f]{4})>#is', $line, $map)) {
          // <from> <to> <first target>: consecutive targets.
          $from = hexdec($map[1]);
          $to = hexdec($map[2]);
          $_from = hexdec($map[3]);
          for ($m = $from, $n = 0; $m <= $to; $m++, $n++) {
            $transformations[sprintf("%04X", $m)] = sprintf("%04X", $_from + $n);
          }
        } elseif (preg_match('#<([0-9a-f]{4})>\s+<([0-9a-f]{4})>\s+\[(.*)\]#ismU', $line, $map)) {
          // <from> <to> [<t1> <t2> ...]: explicit target per code.
          $from = hexdec($map[1]);
          $to = hexdec($map[2]);
          $parts = preg_split('#\s+#', trim($map[3]));
          for ($m = $from, $n = 0; $m <= $to && $n < count($parts); $m++, $n++) {
            $target = preg_replace('#[^0-9A-Fa-f]#', "", $parts[$n]);
            if ($target !== "") {
              $transformations[sprintf("%04X", $m)] = sprintf("%04X", hexdec($target));
            }
          }
        }
      }
    }
  }

  /**
   * Convert the digits of one "<...>" hex string to UTF-8 text.
   *
   * @param string $hex Characters found between "<" and ">".
   * @param array  $map Upper-case code => replacement hex string.
   * @return string Decoded text.
   */
  private function hexRunToText($hex, $map) {
    $hex = strtoupper(preg_replace('#[^0-9A-Fa-f]#', "", $hex));
    if ($hex === "") {
      return "";
    }
    $out = "";
    foreach (str_split($hex, 4) as $code) {
      $code = str_pad($code, 4, "0", STR_PAD_LEFT);
      if (isset($map[$code])) {
        $code = (string) $map[$code];
      }
      // A mapping may expand to several 4-digit units (e.g. ligatures).
      foreach (str_split($code, 4) as $unit) {
        if ($unit !== "") {
          $out .= $this->entityToUtf8("&#x".$unit.";");
        }
      }
    }
    return $out;
  }

  /**
   * Turn collected text operands into readable text.
   *
   * Each operand may mix "(literal)" strings and "<hex>" strings; anything
   * between them (kerning numbers etc.) is ignored. Hex codes are looked up
   * in $transformations before being decoded. Every operand is followed by
   * a line feed.
   *
   * @param array $texts           Operands from getDirtyTexts().
   * @param array $transformations Mappings from getCharTransformations().
   * @return string Extracted text (UTF-8).
   */
  function getTextUsingTransformations($texts, $transformations) {
    $map = array_change_key_case((array) $transformations, CASE_UPPER);
    $document = "";
    foreach ((array) $texts as $text) {
      $isHex = false;
      $isPlain = false;
      $hex = "";
      $plain = "";
      $n = strlen($text);
      for ($j = 0; $j < $n; $j++) {
        $c = $text[$j];

        if ($isPlain) {
          if ($c === ")") {
            $document .= $plain;
            $isPlain = false;
          } elseif ($c === "\\") {
            $c2 = ($j + 1 < $n) ? $text[$j + 1] : "";
            if ($c2 === "n") $plain .= "\n";
            elseif ($c2 === "r") $plain .= "\r";
            elseif ($c2 === "t") $plain .= "\t";
            elseif ($c2 === "b") $plain .= chr(8);
            elseif ($c2 === "f") $plain .= "\f";
            elseif ($c2 !== "" && strpos("01234567", $c2) !== false) {
              // \ddd : up to three octal digits.
              preg_match('#^[0-7]{1,3}#', substr($text, $j + 1, 3), $oct);
              $plain .= $this->entityToUtf8("&#".octdec($oct[0]).";");
              $j += strlen($oct[0]) - 1;
            } elseif ($c2 === "\r" || $c2 === "\n") {
              // Backslash + line break is a line continuation: no output.
            } else {
              // "\\", "\(", "\)" and any other character stand for themselves.
              $plain .= $c2;
            }
            $j++;
          } else {
            $plain .= $c;
          }
          continue;
        }

        if ($isHex) {
          if ($c === ">") {
            $document .= $this->hexRunToText($hex, $map);
            $isHex = false;
          } else {
            $hex .= $c;
          }
          continue;
        }

        if ($c === "<") {
          $hex = "";
          $isHex = true;
        } elseif ($c === "(") {
          $plain = "";
          $isPlain = true;
        } elseif ($c === "\\") {
          $j++;                // escaped character outside any string
        }
      }
      $document .= "\n";
    }
    return $document;
  }

  /**
   * Extract text from a PDF file.
   *
   * Every "obj ... endobj" that has a stream, and is neither typed nor an
   * embedded font program, is decoded. Streams with BT...ET blocks provide
   * the text operands; the others are searched for character mappings.
   *
   * @param string $input_file Path of the .pdf file.
   * @return string Extracted text, or "" when nothing could be read.
   */
  function pdf_to_text($input_file) {
    $infile = @file_get_contents($input_file);
    if (empty($infile)) {
      return "";
    }
    $transformations = array();
    $texts = array();
    if (!preg_match_all("#obj(.*)endobj#ismU", $infile, $objects)) {
      return "";
    }
    foreach ($objects[1] as $currentObject) {
      if (!preg_match("#stream(.*)endstream#ismU", $currentObject, $stream)) {
        continue;
      }
      $options = $this->getObjectOptions($currentObject);
      if (!(empty($options["Length1"]) && empty($options["Type"]) && empty($options["Subtype"]))) {
        continue;
      }
      $data = $this->getDecodedStream(ltrim($stream[1]), $options);
      if (!is_string($data) || $data === "") {
        continue;
      }
      if (preg_match_all("#BT(.*)ET#ismU", $data, $textContainers)) {
        $this->getDirtyTexts($texts, $textContainers[1]);
      } else {
        $this->getCharTransformations($transformations, $data);
      }
    }
    return $this->getTextUsingTransformations($texts, $transformations);
  }


  /**
   * Convert the document given to the constructor to plain text.
   *
   * The converter is chosen from the file extension (case-insensitive).
   *
   * @return string|false Extracted text; "File Not exists" when the file is
   *                      missing; "Invalid File Type" for an unsupported or
   *                      missing extension; false when a .docx archive
   *                      cannot be opened.
   */
  public function convertToText() {
    if(isset($this->filename) && !file_exists($this->filename)) {
      return "File Not exists";
    }
    $file_ext = strtolower((string) pathinfo((string) $this->filename, PATHINFO_EXTENSION));
    switch ($file_ext) {
      case "doc":
        return $this->read_doc();
      case "docx":
        return $this->read_docx();
      case "odt":
        return $this->odt_to_text($this->filename);
      case "rtf":
        return $this->rtf_to_text($this->filename);
      case "xlsx":
        return $this->xlsx_to_text($this->filename);
      case "pptx":
        return $this->pptx_to_text($this->filename);
      case "pdf":
        return $this->pdf_to_text($this->filename);
      default:
        return "Invalid File Type";
    }
  }
}

// Usage example: enable exactly one of the constructor lines below, then
// print the text extracted from that document.
$docObj = new DocxConversion("test.doc");
//$docObj = new DocxConversion("test.docx");
//$docObj = new DocxConversion("test.odt");
//$docObj = new DocxConversion("test.rtf");
//$docObj = new DocxConversion("test.xlsx");
//$docObj = new DocxConversion("test.pptx");
//$docObj = new DocxConversion("test.pdf");
$docText = $docObj->convertToText();
echo $docText;