[289] | 1 | <?php |
---|
| 2 | /** |
---|
| 3 | * PHPExcel |
---|
| 4 | * |
---|
| 5 | * Copyright (c) 2006 - 2014 PHPExcel |
---|
| 6 | * |
---|
| 7 | * This library is free software; you can redistribute it and/or |
---|
| 8 | * modify it under the terms of the GNU Lesser General Public |
---|
| 9 | * License as published by the Free Software Foundation; either |
---|
| 10 | * version 2.1 of the License, or (at your option) any later version. |
---|
| 11 | * |
---|
| 12 | * This library is distributed in the hope that it will be useful, |
---|
| 13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
---|
| 14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
---|
| 15 | * Lesser General Public License for more details. |
---|
| 16 | * |
---|
| 17 | * You should have received a copy of the GNU Lesser General Public |
---|
| 18 | * License along with this library; if not, write to the Free Software |
---|
| 19 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
---|
| 20 | * |
---|
| 21 | * @category PHPExcel |
---|
| 22 | * @package PHPExcel_Reader |
---|
| 23 | * @copyright Copyright (c) 2006 - 2014 PHPExcel (http://www.codeplex.com/PHPExcel) |
---|
| 24 | * @license http://www.gnu.org/licenses/old-licenses/lgpl-2.1.txt LGPL |
---|
| 25 | * @version 1.8.0, 2014-03-02 |
---|
| 26 | */ |
---|
| 27 | |
---|
| 28 | |
---|
/**
 * PHPExcel root directory.
 *
 * When this file is the first PHPExcel file to be loaded, PHPEXCEL_ROOT is
 * not defined yet: define it relative to this file and pull in the autoloader.
 */
if (defined('PHPEXCEL_ROOT') === false) {
    /**
     * @ignore
     */
    define('PHPEXCEL_ROOT', dirname(__FILE__) . '/../../');
    require PHPEXCEL_ROOT . 'PHPExcel/Autoloader.php';
}
---|
| 37 | |
---|
/**
 * PHPExcel_Reader_HTML
 *
 * Loads an HTML file through DOMDocument and walks the resulting DOM tree,
 * writing the text it finds into the cells of one worksheet. Block level
 * elements move the row cursor, table cells move the column cursor, and a
 * small set of elements (headings, links, horizontal rules) get a style
 * applied from the $_formats table.
 *
 * @category   PHPExcel
 * @package    PHPExcel_Reader
 * @copyright  Copyright (c) 2006 - 2014 PHPExcel (http://www.codeplex.com/PHPExcel)
 */
class PHPExcel_Reader_HTML extends PHPExcel_Reader_Abstract implements PHPExcel_Reader_IReader
{
    /**
     * Input encoding
     *
     * @var string
     */
    private $_inputEncoding = 'ANSI';

    /**
     * Sheet index to read
     *
     * @var int
     */
    private $_sheetIndex = 0;

    /**
     * Formats
     *
     * Style arrays keyed by HTML element name; each one is passed to
     * applyFromArray() on the style of the cell the element ends up in.
     *
     * @var array
     */
    private $_formats = array(
        'h1' => array(
            'font' => array(
                'bold' => true,
                'size' => 24,
            ),
        ), // Bold, 24pt
        'h2' => array(
            'font' => array(
                'bold' => true,
                'size' => 18,
            ),
        ), // Bold, 18pt
        'h3' => array(
            'font' => array(
                'bold' => true,
                'size' => 13.5,
            ),
        ), // Bold, 13.5pt
        'h4' => array(
            'font' => array(
                'bold' => true,
                'size' => 12,
            ),
        ), // Bold, 12pt
        'h5' => array(
            'font' => array(
                'bold' => true,
                'size' => 10,
            ),
        ), // Bold, 10pt
        'h6' => array(
            'font' => array(
                'bold' => true,
                'size' => 7.5,
            ),
        ), // Bold, 7.5pt
        'a' => array(
            'font' => array(
                'underline' => true,
                'color' => array(
                    'argb' => PHPExcel_Style_Color::COLOR_BLUE,
                ),
            ),
        ), // Blue underlined
        'hr' => array(
            'borders' => array(
                'bottom' => array(
                    'style' => PHPExcel_Style_Border::BORDER_THIN,
                    // Keyed by 'argb' like the hyperlink colour above; the
                    // previous un-keyed entry did not name a colour component.
                    'color' => array(
                        'argb' => PHPExcel_Style_Color::COLOR_BLACK,
                    ),
                ),
            ),
        ), // Bottom border
    );

    /**
     * Data Array used for testing only, should write to PHPExcel object on completion of tests
     *
     * Indexed as [row][column]; filled by _flushCell().
     *
     * @var array
     */
    private $_dataArray = array();

    /**
     * Current table nesting depth (0 when the cursor is not inside a table)
     *
     * @var int
     */
    private $_tableLevel = 0;

    /**
     * Start column of each open table, indexed by nesting level
     *
     * @var array
     */
    private $_nestedColumn = array('A');

    /**
     * Create a new PHPExcel_Reader_HTML
     */
    public function __construct()
    {
        $this->_readFilter = new PHPExcel_Reader_DefaultReadFilter();
    }

    /**
     * Validate that the current file is an HTML file
     *
     * Reads from the already opened $this->_fileHandle and accepts the data
     * when it contains a '<' and strip_tags() actually removes something.
     *
     * @return boolean
     */
    protected function _isValidFormat()
    {
        // Reading 2048 bytes should be enough to validate that the format is HTML
        $data = fread($this->_fileHandle, 2048);

        $hasOpeningBracket = strpos($data, '<') !== false;
        $hasStrippableTags = strlen($data) !== strlen(strip_tags($data));

        return $hasOpeningBracket && $hasStrippableTags;
    }

    /**
     * Loads PHPExcel from file
     *
     * @param  string $pFilename
     * @return PHPExcel
     * @throws PHPExcel_Reader_Exception
     */
    public function load($pFilename)
    {
        // Create new PHPExcel and load into that instance
        $objPHPExcel = new PHPExcel();

        return $this->loadIntoExisting($pFilename, $objPHPExcel);
    }

    /**
     * Set input encoding
     *
     * @param  string $pValue Input encoding
     * @return PHPExcel_Reader_HTML
     */
    public function setInputEncoding($pValue = 'ANSI')
    {
        $this->_inputEncoding = $pValue;

        return $this;
    }

    /**
     * Get input encoding
     *
     * @return string
     */
    public function getInputEncoding()
    {
        return $this->_inputEncoding;
    }

    /**
     * Enter a table: increase the nesting level and remember its start column
     *
     * An outermost table always starts in column 'A'; a nested table starts
     * in the column that was passed in.
     *
     * @param  string $column Column the cursor is in when the table opens
     * @return string Start column recorded for the new nesting level
     */
    private function _setTableStartColumn($column)
    {
        if ($this->_tableLevel == 0) {
            $column = 'A';
        }
        ++$this->_tableLevel;
        $this->_nestedColumn[$this->_tableLevel] = $column;

        return $this->_nestedColumn[$this->_tableLevel];
    }

    /**
     * Get the start column recorded for the current table nesting level
     *
     * @return string
     */
    private function _getTableStartColumn()
    {
        return $this->_nestedColumn[$this->_tableLevel];
    }

    /**
     * Leave a table: decrease the nesting level and drop its start column
     *
     * @return string The start column that was popped from the stack
     */
    private function _releaseTableStartColumn()
    {
        --$this->_tableLevel;

        return array_pop($this->_nestedColumn);
    }

    /**
     * Write the pending cell content to the worksheet and reset it
     *
     * Plain strings are written only when they contain something other than
     * whitespace. Non-string content (rich text, not implemented yet) is only
     * recorded in the test data array. In every case $cellContent is reset to
     * an empty string afterwards.
     *
     * @param PHPExcel_Worksheet $sheet       Worksheet being written to
     * @param string             $column      Column of the target cell
     * @param int                $row         Row of the target cell
     * @param mixed              $cellContent Pending content (by reference)
     */
    private function _flushCell($sheet, $column, $row, &$cellContent)
    {
        if (is_string($cellContent)) {
            // Simple String content: only write it if there's content in the string
            if (trim($cellContent) > '') {
                $sheet->setCellValue($column . $row, $cellContent, true);
                $this->_dataArray[$row][$column] = $cellContent;
            }
        } else {
            // We have a Rich Text run
            // TODO
            $this->_dataArray[$row][$column] = 'RICH TEXT: ' . $cellContent;
        }

        $cellContent = (string) '';
    }

    /**
     * Recursively walk the children of a DOM node and write them to the sheet
     *
     * The row, column and pending cell content are shared across the whole
     * recursion by reference, so every element type can move the cursor or
     * flush the content collected so far.
     *
     * @param DOMNode            $element     Node whose children are processed
     * @param PHPExcel_Worksheet $sheet       Worksheet being written to
     * @param int                $row         Current row (by reference)
     * @param string             $column      Current column (by reference)
     * @param mixed              $cellContent Pending cell content (by reference)
     */
    private function _processDomElement(DOMNode $element, $sheet, &$row, &$column, &$cellContent)
    {
        foreach ($element->childNodes as $child) {
            if ($child instanceof DOMText) {
                // Collapse every run of whitespace in the text to a single space
                $domText = preg_replace('/\s+/', ' ', trim($child->nodeValue));
                if (is_string($cellContent)) {
                    // simply append the text if the cell content is a plain text string
                    $cellContent .= $domText;
                }
                // TODO: if we have a rich text run instead, we need to append it correctly
                continue;
            }

            if (!($child instanceof DOMElement)) {
                // Comments, processing instructions etc. carry nothing to import
                continue;
            }

            switch ($child->nodeName) {
                case 'meta':
                    // TODO: extract the character set from the "content" attribute,
                    // so we can convert to UTF-8 if required
                    $this->_processDomElement($child, $sheet, $row, $column, $cellContent);
                    break;

                case 'title':
                    // The document title becomes the worksheet title
                    $this->_processDomElement($child, $sheet, $row, $column, $cellContent);
                    $sheet->setTitle($cellContent);
                    $cellContent = '';
                    break;

                case 'span':
                case 'div':
                case 'font':
                case 'i':
                case 'em':
                case 'strong':
                case 'b':
                    // Styling or grouping element: keep its text apart from its
                    // neighbours with a space on either side
                    if ($cellContent > '') {
                        $cellContent .= ' ';
                    }
                    $this->_processDomElement($child, $sheet, $row, $column, $cellContent);
                    if ($cellContent > '') {
                        $cellContent .= ' ';
                    }
                    break;

                case 'hr':
                    $this->_flushCell($sheet, $column, $row, $cellContent);
                    ++$row;
                    if (isset($this->_formats[$child->nodeName])) {
                        $sheet->getStyle($column . $row)->applyFromArray($this->_formats[$child->nodeName]);
                    } else {
                        $cellContent = '----------';
                        $this->_flushCell($sheet, $column, $row, $cellContent);
                    }
                    ++$row;
                    // A horizontal rule is followed by a line break, simply by
                    // dropping through into the 'br' handling below
                    // no break
                case 'br':
                    if ($this->_tableLevel > 0) {
                        // If we're inside a table, replace with a \n
                        $cellContent .= "\n";
                    } else {
                        // Otherwise flush our existing content and move the row cursor on
                        $this->_flushCell($sheet, $column, $row, $cellContent);
                        ++$row;
                    }
                    break;

                case 'a':
                    if ($child->hasAttribute('href')) {
                        // Attach the link to the current cell and style it as a hyperlink
                        $sheet->getCell($column . $row)->getHyperlink()->setUrl($child->getAttribute('href'));
                        if (isset($this->_formats[$child->nodeName])) {
                            $sheet->getStyle($column . $row)->applyFromArray($this->_formats[$child->nodeName]);
                        }
                    }
                    $cellContent .= ' ';
                    $this->_processDomElement($child, $sheet, $row, $column, $cellContent);
                    break;

                case 'h1':
                case 'h2':
                case 'h3':
                case 'h4':
                case 'h5':
                case 'h6':
                case 'ol':
                case 'ul':
                case 'p':
                    if ($this->_tableLevel > 0) {
                        // If we're inside a table, replace with a \n
                        $cellContent .= "\n";
                        $this->_processDomElement($child, $sheet, $row, $column, $cellContent);
                    } else {
                        // Outside a table: flush what precedes the block and
                        // leave an empty row before it
                        if ($cellContent > '') {
                            $this->_flushCell($sheet, $column, $row, $cellContent);
                            $row += 2;
                        }
                        $this->_processDomElement($child, $sheet, $row, $column, $cellContent);
                        $this->_flushCell($sheet, $column, $row, $cellContent);

                        if (isset($this->_formats[$child->nodeName])) {
                            $sheet->getStyle($column . $row)->applyFromArray($this->_formats[$child->nodeName]);
                        }

                        // Leave an empty row after the block and return to the first column
                        $row += 2;
                        $column = 'A';
                    }
                    break;

                case 'li':
                    if ($this->_tableLevel > 0) {
                        // If we're inside a table, replace with a \n
                        $cellContent .= "\n";
                        $this->_processDomElement($child, $sheet, $row, $column, $cellContent);
                    } else {
                        // Outside a table every list entry gets a row of its own
                        if ($cellContent > '') {
                            $this->_flushCell($sheet, $column, $row, $cellContent);
                        }
                        ++$row;
                        $this->_processDomElement($child, $sheet, $row, $column, $cellContent);
                        $this->_flushCell($sheet, $column, $row, $cellContent);
                        $column = 'A';
                    }
                    break;

                case 'table':
                    $this->_flushCell($sheet, $column, $row, $cellContent);
                    $column = $this->_setTableStartColumn($column);
                    if ($this->_tableLevel > 1) {
                        // Nested table: step back a row, because its first
                        // <tr> advances the row cursor again
                        --$row;
                    }
                    $this->_processDomElement($child, $sheet, $row, $column, $cellContent);
                    $column = $this->_releaseTableStartColumn();
                    // NOTE: _tableLevel has already been decremented by the release above
                    if ($this->_tableLevel > 1) {
                        ++$column;
                    } else {
                        ++$row;
                    }
                    break;

                case 'thead':
                case 'tbody':
                    $this->_processDomElement($child, $sheet, $row, $column, $cellContent);
                    break;

                case 'tr':
                    // Every table row starts on a new sheet row, in the table's start column
                    ++$row;
                    $column = $this->_getTableStartColumn();
                    $cellContent = '';
                    $this->_processDomElement($child, $sheet, $row, $column, $cellContent);
                    break;

                case 'th':
                case 'td':
                    // Collect the cell's content, write it, and move one column right
                    $this->_processDomElement($child, $sheet, $row, $column, $cellContent);
                    $this->_flushCell($sheet, $column, $row, $cellContent);
                    ++$column;
                    break;

                case 'body':
                    // Start of the actual content: reset the cursor and discard
                    // any text collected so far, so that it cannot end up in
                    // the first cell of the body
                    $row = 1;
                    $column = 'A';
                    $cellContent = '';
                    $this->_tableLevel = 0;
                    $this->_processDomElement($child, $sheet, $row, $column, $cellContent);
                    break;

                default:
                    // Unknown element: just descend into it
                    $this->_processDomElement($child, $sheet, $row, $column, $cellContent);
            }
        }
    }

    /**
     * Loads PHPExcel from file into PHPExcel instance
     *
     * @param  string   $pFilename
     * @param  PHPExcel $objPHPExcel
     * @return PHPExcel
     * @throws PHPExcel_Reader_Exception
     */
    public function loadIntoExisting($pFilename, PHPExcel $objPHPExcel)
    {
        // Open the file just long enough to validate its format
        $this->_openFile($pFilename);
        $isHtml = $this->_isValidFormat();
        fclose($this->_fileHandle);
        if (!$isHtml) {
            throw new PHPExcel_Reader_Exception($pFilename . " is an Invalid HTML file.");
        }

        // Make sure the target sheet exists, then make it the active one
        while ($objPHPExcel->getSheetCount() <= $this->_sheetIndex) {
            $objPHPExcel->createSheet();
        }
        $objPHPExcel->setActiveSheetIndex($this->_sheetIndex);

        // Create a new DOM object
        $dom = new DOMDocument();
        // Discard white space; set before parsing, because assigning it after
        // the load cannot influence a tree that has already been built
        $dom->preserveWhiteSpace = false;
        // Reload the HTML file into the DOM object
        $loaded = $dom->loadHTMLFile($pFilename, PHPExcel_Settings::getLibXmlLoaderOptions());
        if ($loaded === false) {
            // The message must be concatenated: passing the parts as separate
            // arguments would hand the file name to the exception as its code
            throw new PHPExcel_Reader_Exception('Failed to load ' . $pFilename . ' as a DOM Document');
        }

        $row = 0;
        $column = 'A';
        $cellContent = '';
        $this->_processDomElement($dom, $objPHPExcel->getActiveSheet(), $row, $column, $cellContent);

        // Return
        return $objPHPExcel;
    }

    /**
     * Get sheet index
     *
     * @return int
     */
    public function getSheetIndex()
    {
        return $this->_sheetIndex;
    }

    /**
     * Set sheet index
     *
     * @param  int $pValue Sheet index
     * @return PHPExcel_Reader_HTML
     */
    public function setSheetIndex($pValue = 0)
    {
        $this->_sheetIndex = $pValue;

        return $this;
    }
}
---|