/**
 * Extract the plain text of a .docx file.
 *
 * @param string $filename Path to the .docx archive.
 * @return string Contents of word/document.xml with all markup stripped,
 *                or an empty string if the file cannot be read.
 */
function docx2text($filename)
{
    return readZippedXML($filename, "word/document.xml");
}

/**
 * Read one XML member out of a ZIP archive and return it without its tags.
 *
 * @param string $archiveFile Path to the ZIP archive (e.g. a .docx file).
 * @param string $dataFile    Name of the XML member inside the archive.
 * @return string The member's text with XML tags removed, or an empty string
 *                when the archive cannot be opened, the member is missing or
 *                empty, or the member is not well-formed XML.
 */
function readZippedXML($archiveFile, $dataFile)
{
    $zip = new ZipArchive;

    // open() returns true on success and an integer error code otherwise,
    // hence the strict comparison.
    if (true !== $zip->open($archiveFile)) {
        return "";
    }

    // Read the member (if present) and release the archive before parsing.
    $data = false;
    $index = $zip->locateName($dataFile);
    if ($index !== false) {
        $data = $zip->getFromIndex($index);
    }
    $zip->close();

    if ($data === false || $data === "") {
        return "";
    }

    // loadXML() is an instance method; calling it statically is an error on
    // PHP 8, so parse into an explicit DOMDocument instance.
    $xml = new DOMDocument();

    // Parser errors and warnings are suppressed; failure is reported through
    // the return value instead. LIBXML_NOENT and LIBXML_XINCLUDE are
    // deliberately not passed: they make the parser expand entities and
    // XInclude references, which is unsafe for documents from untrusted
    // sources.
    if (!$xml->loadXML($data, LIBXML_NOERROR | LIBXML_NOWARNING)) {
        return "";
    }

    // Serialise the document and drop every tag, leaving only the text.
    return strip_tags($xml->saveXML());
}

echo docx2text("test.docx");
Thursday, April 7, 2011
Convert DOCX to TEXT with PHP
First, you have to use PHP 5.2+ and enable the ZIP (ZipArchive) extension for PHP.
Subscribe to:
Post Comments (Atom)
It's working. Thank you so much :-)
Thank you!! This is so cool. I took the liberty of altering your code to convert PPTX to text with PHP:
/**
 * Extract the plain text of a .pptx file.
 *
 * Slides are read from ppt/slides/slide1.xml, slide2.xml, ... until the first
 * missing number; each slide is parsed as XML and its tags are stripped.
 *
 * @param string $filename Path to the .pptx archive.
 * @return string Text of all slides that could be read and parsed, or an
 *                empty string if the archive cannot be opened.
 */
function pptx2text($filename)
{
    $zip = new ZipArchive;

    // open() returns true on success and an integer error code otherwise.
    if (true !== $zip->open($filename)) {
        // In case of failure return empty string
        return "";
    }

    // Collect the raw XML of consecutively numbered slides.
    $dia = 1;
    $data = array();
    while (($index = $zip->locateName("ppt/slides/slide$dia.xml")) !== false) {
        $data[] = $zip->getFromIndex($index);
        $dia++;
    }
    $zip->close();

    $output = "";
    foreach ($data as $slideXml) {
        // Skip slides that could not be read from the archive.
        if ($slideXml === false || $slideXml === "") {
            continue;
        }

        // NOTE(review): the tag originally replaced with a space here was lost
        // when this snippet was published; "</a:p>" (end of a text paragraph)
        // is assumed. The space is inserted before the closing tag so the
        // markup stays well-formed for the XML parser below.
        $slideXml = str_replace("</a:p>", " </a:p>", $slideXml);

        // loadXML() must be called on an instance; entity and XInclude
        // expansion are left off because the file may come from an untrusted
        // source.
        $xml = new DOMDocument();
        if (!$xml->loadXML($slideXml, LIBXML_NOERROR | LIBXML_NOWARNING)) {
            continue;
        }

        $output .= $xml->saveXML();
    }

    return strip_tags($output);
}
Anonymous, I found some issues in your modified pptx2text script; I made a few changes and now it works for me. Suit yourself. Regards, Pavel:
ReplyDelete/////// PARSE PPTX ///////
/**
 * Extract the plain text of a .pptx file without parsing the XML.
 *
 * Slides are read from ppt/slides/slide1.xml, slide2.xml, ... until the first
 * missing number, and their tags are stripped from the raw markup.
 *
 * @param string $filename Path to the .pptx archive.
 * @return string Text of all readable slides, or an empty string if the
 *                archive cannot be opened.
 */
function pptx2text($filename)
{
    $zip = new ZipArchive;

    // open() returns true on success and an integer error code otherwise.
    if (true === $zip->open($filename)) {
        $dia = 1;
        $output = "";

        while (($index = $zip->locateName("ppt/slides/slide$dia.xml")) !== false) {
            $slideXml = $zip->getFromIndex($index);

            // getFromIndex() returns false when the entry cannot be read.
            if ($slideXml !== false) {
                // NOTE(review): the tag originally replaced here was lost when
                // this snippet was published, leaving an empty search string
                // that made the call a no-op; "</a:p>" (end of a text
                // paragraph) is assumed. Without a separator the text of
                // adjacent paragraphs runs together once tags are stripped.
                $output .= str_replace("</a:p>", " ", $slideXml);
            }

            $dia++;
        }

        $zip->close();

        return strip_tags($output);
    } else {
        // In case of failure return empty string
        return "";
    }
}

echo pptx2text("status.pptx");