/**
 * Extract the plain text of a DOCX file.
 *
 * Delegates to readZippedXML(), asking for the archive's
 * "word/document.xml" entry.
 *
 * @param string $filename Path to the .docx file.
 * @return string Whatever readZippedXML() returns for that entry.
 */
function docx2text($filename) {
    $mainPart = "word/document.xml";
    $text = readZippedXML($filename, $mainPart);

    return $text;
}
/**
 * Read one XML entry from a ZIP archive and return its text with all
 * markup removed.
 *
 * @param string $archiveFile Path to the ZIP-based archive (e.g. a .docx file).
 * @param string $dataFile    Name of the entry inside the archive.
 * @return string The entry's content without XML tags, or "" when the archive
 *                cannot be opened, the entry is missing or empty, or the entry
 *                is not well-formed XML.
 */
function readZippedXML($archiveFile, $dataFile) {
    $zip = new ZipArchive;

    // open() returns true on success and an integer error code otherwise,
    // so only a strict comparison with true means "opened".
    if (true !== $zip->open($archiveFile)) {
        return "";
    }

    // Read the entry (if present) and release the archive before parsing.
    $index = $zip->locateName($dataFile);
    $data = ($index !== false) ? $zip->getFromIndex($index) : false;
    $zip->close();

    // getFromIndex() yields false on failure; an empty string cannot be parsed.
    if ($data === false || $data === "") {
        return "";
    }

    // loadXML() is an instance method: calling it statically is an error on
    // current PHP versions. Entity substitution and XInclude are left off and
    // network access is forbidden because the archive content is untrusted.
    $xml = new DOMDocument();
    if (!$xml->loadXML($data, LIBXML_NONET | LIBXML_NOERROR | LIBXML_NOWARNING)) {
        return "";
    }

    // Return data without XML formatting tags
    return strip_tags($xml->saveXML());
}
// Usage example: write the extracted text of test.docx to the output.
print docx2text("test.docx");
Thursday, April 7, 2011
Convert DOCX to TEXT with PHP
First, you have to use PHP 5.2+ and enable the ZIP (ZipArchive) extension for PHP.
Subscribe to:
Post Comments (Atom)
It's working. Thank you so much :-)
Thank you!! This is so cool. I took the liberty of altering your code to convert pptx to text with PHP:
/**
 * Extract the plain text of every slide in a PPTX presentation.
 *
 * Slides are read from "ppt/slides/slide1.xml", "ppt/slides/slide2.xml", ...
 * until the next number is not found in the archive.
 *
 * @param string $filename Path to the .pptx file.
 * @return string Text of all slides with XML tags removed, or "" when the
 *                archive cannot be opened.
 */
function pptx2text($filename) {
    $zip = new ZipArchive;

    // open() returns true on success and an integer error code otherwise.
    if (true !== $zip->open($filename)) {
        // In case of failure return empty string
        return "";
    }

    // Collect the raw XML of each consecutively numbered slide.
    $data = array();
    for ($dia = 1; ($index = $zip->locateName("ppt/slides/slide$dia.xml")) !== false; $dia++) {
        $data[] = $zip->getFromIndex($index);
    }
    $zip->close();

    $output = "";
    foreach ($data as $slideXml) {
        // getFromIndex() yields false on failure; an empty string cannot be parsed.
        if ($slideXml === false || $slideXml === "") {
            continue;
        }

        // Put a space in front of every paragraph end so text from adjacent
        // paragraphs does not run together once the tags are stripped. The
        // closing tag itself is kept so the XML stays well-formed for the parser.
        $slideXml = str_replace("</a:p>", " </a:p>", $slideXml);

        // loadXML() is an instance method: calling it statically is an error on
        // current PHP versions. Entity substitution and XInclude are left off and
        // network access is forbidden because the archive content is untrusted.
        $xml = new DOMDocument();
        if (!$xml->loadXML($slideXml, LIBXML_NONET | LIBXML_NOERROR | LIBXML_NOWARNING)) {
            // Skip a slide that is not well-formed instead of aborting.
            continue;
        }
        $output .= $xml->saveXML();
    }

    return strip_tags($output);
}
Anonymous, I found some issues in your modified pptx2text script. I made a few changes and now it works for me. Suit yourself. Regards, Pavel:
/////// PARSE PPTX ///////
/**
 * Extract the plain text of every slide in a PPTX presentation without
 * parsing the XML: the raw slide markup is concatenated and its tags stripped.
 *
 * Slides are read from "ppt/slides/slide1.xml", "ppt/slides/slide2.xml", ...
 * until the next number is not found in the archive.
 *
 * @param string $filename Path to the .pptx file.
 * @return string Text of all slides with XML tags removed, or "" when the
 *                archive cannot be opened.
 */
function pptx2text($filename) {
    $zip = new ZipArchive;

    // open() returns true on success and an integer error code otherwise.
    if (true !== $zip->open($filename)) {
        // In case of failure return empty string
        return "";
    }

    $output = "";
    for ($dia = 1; ($index = $zip->locateName("ppt/slides/slide$dia.xml")) !== false; $dia++) {
        $slideXml = $zip->getFromIndex($index);

        // getFromIndex() yields false on failure; skip the slide in that case.
        if ($slideXml === false) {
            continue;
        }

        // Replace every paragraph end with a space so text from adjacent
        // paragraphs does not run together once the tags are stripped.
        // (An empty search string would leave the markup untouched.)
        $output .= str_replace("</a:p>", " ", $slideXml);
    }
    $zip->close();

    return strip_tags($output);
}
// Usage example: write the extracted text of status.pptx to the output.
print pptx2text("status.pptx");