source: pro-violet-viettel/sourcecode/api.violet.vn/www/lib/common/html2text.php @ 289

Last change on this file since 289 was 289, checked in by dungnv, 11 years ago
File size: 14.2 KB
Line 
1<?php
2
3/*************************************************************************
4 *                                                                       *
5 * class.html2text.inc                                                   *
6 *                                                                       *
7 *************************************************************************
8 *                                                                       *
9 * Converts HTML to formatted plain text                                 *
10 *                                                                       *
11 * Copyright (c) 2005 Jon Abernathy <jon@chuggnutt.com>                  *
12 * All rights reserved.                                                  *
13 *                                                                       *
14 * This script is free software; you can redistribute it and/or modify   *
15 * it under the terms of the GNU General Public License as published by  *
16 * the Free Software Foundation; either version 2 of the License, or     *
17 * (at your option) any later version.                                   *
18 *                                                                       *
19 * The GNU General Public License can be found at                        *
20 * http://www.gnu.org/copyleft/gpl.html.                                 *
21 *                                                                       *
22 * This script is distributed in the hope that it will be useful,        *
23 * but WITHOUT ANY WARRANTY; without even the implied warranty of        *
24 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the          *
25 * GNU General Public License for more details.                          *
26 *                                                                       *
27 * Author(s): Jon Abernathy <jon@chuggnutt.com>                          *
28 *                                                                       *
29 * Last modified: 11/02/06                                               *
30 *                                                                       *
31 *************************************************************************/
32
33
34/**
35 *  Takes HTML and converts it to formatted, plain text.
36 *
37 *  Thanks to Alexander Krug (http://www.krugar.de/) for pointing out and
38 *  correcting an error in the regexp search array. Fixed 7/30/03.
39 *
40 *  Updated set_html() function's file reading mechanism, 9/25/03.
41 *
42 *  Thanks to Joss Sanglier (http://www.dancingbear.co.uk/) for adding
43 *  several more HTML entity codes to the $search and $replace arrays.
44 *  Updated 11/7/03.
45 *
46 *  Thanks to Darius Kasperavicius (http://www.dar.dar.lt/) for
47 *  suggesting the addition of $allowed_tags and its supporting function
48 *  (which I slightly modified). Updated 3/12/04.
49 *
50 *  Thanks to Justin Dearing for pointing out that a replacement for the
51 *  <TH> tag was missing, and suggesting an appropriate fix.
52 *  Updated 8/25/04.
53 *
54 *  Thanks to Mathieu Collas (http://www.myefarm.com/) for finding a
55 *  display/formatting bug in the _build_link_list() function: email
56 *  readers would show the left bracket and number ("[1") as part of the
57 *  rendered email address.
58 *  Updated 12/16/04.
59 *
60 *  Thanks to Wojciech Bajon (http://histeria.pl/) for submitting code
61 *  to handle relative links, which I hadn't considered. I modified his
62 *  code a bit to handle normal HTTP links and MAILTO links. Also for
63 *  suggesting three additional HTML entity codes to search for.
64 *  Updated 03/02/05.
65 *
66 *  Thanks to Jacob Chandler for pointing out another link condition
67 *  for the _build_link_list() function: "https".
68 *  Updated 04/06/05.
69 *
70 *  Thanks to Marc Bertrand (http://www.dresdensky.com/) for
71 *  suggesting a revision to the word wrapping functionality; if you
72 *  specify a $width of 0 or less, word wrapping will be ignored.
73 *  Updated 11/02/06.
74 *
75 *  @author Jon Abernathy <jon@chuggnutt.com>
76 *  @version 0.6.2
77 *  @since PHP 4.0.2
78 */
class html2text
{

    /**
     *  The HTML content to convert.
     *
     *  @var string
     */
    private $html;

    /**
     *  The converted, formatted text (valid once $_converted is true).
     *
     *  @var string
     */
    private $text;

    /**
     *  Maximum width of the formatted text, in columns.
     *
     *  A value of 0 (or less) disables word wrapping. The default here is 0
     *  and this class exposes no setter for it, so wrapping is effectively
     *  off unless the default is edited.
     *
     *  @var integer
     */
    private $width = 0;

    /**
     *  Ordered list of preg* patterns to search for. Entry N is replaced by
     *  entry N of $replace; the rules are applied one after another, in this
     *  order, by _convert().
     *
     *  None of the patterns uses the /e (eval) modifier: it executed the
     *  captured HTML as PHP code and is rejected outright by PHP 7+.
     *
     *  @var array
     *  @see $replace
     */
    private $search = array(
        "/\r/",                                  // Non-legal carriage return
        "/[\n\t]+/",                             // Newlines and tabs
        '/<script[^>]*>.*?<\/script>/i',         // <script>s -- which strip_tags supposedly has problems with
        //'/<!-- .* -->/',                         // Comments -- which strip_tags might have a problem with
        '/<h[123][^>]*>(.+?)<\/h[123]>/i',       // H1 - H3
        '/<h[456][^>]*>(.+?)<\/h[456]>/i',       // H4 - H6 (callback, see $replace)
        '/<p[^>]*>/i',                           // <P>
        '/<br[^>]*>/i',                          // <br>
        '/<b[^>]*>(.+?)<\/b>/i',                 // <b>
        '/<i[^>]*>(.+?)<\/i>/i',                 // <i>
        '/(<ul[^>]*>|<\/ul>)/i',                 // <ul> and </ul>
        '/(<ol[^>]*>|<\/ol>)/i',                 // <ol> and </ol>
        '/<li[^>]*>/i',                          // <li>
        '/<a href="([^"]+)"[^>]*>(.+?)<\/a>/i',  // <a href="">
        '/<hr[^>]*>/i',                          // <hr>
        '/(<table[^>]*>|<\/table>)/i',           // <table> and </table>
        '/(<tr[^>]*>|<\/tr>)/i',                 // <tr> and </tr>
        '/<td[^>]*>(.+?)<\/td>/i',               // <td> and </td>
        '/<th[^>]*>(.+?)<\/th>/i',               // <th> and </th>
        '/&nbsp;/i',
        '/&quot;/i',
        '/&gt;/i',
        '/&lt;/i',
        '/&amp;/i',
        '/&copy;/i',
        '/&trade;/i',
        '/&#8220;/',
        '/&#8221;/',
        '/&#8211;/',
        '/&#8217;/',
        '/&#38;/',
        '/&#169;/',
        '/&#8482;/',
        '/&#151;/',
        '/&#147;/',
        '/&#148;/',
        '/&#149;/',
        '/&reg;/i',
        '/&bull;/i',
        // NOTE(review): this only matches "&" followed by a run of "&"/";"
        // characters. It was presumably meant to be '/&[^&;]+;/i' (drop any
        // remaining entity) -- confirm before changing, because that would
        // start deleting every entity not listed above.
        '/&[&;]+;/i'
    );

    /**
     *  Replacements corresponding, index by index, to the patterns in $search.
     *
     *  A string entry is passed to preg_replace() as-is. An array entry of
     *  the form array('callback' => 'method') names a method of this class
     *  that is handed to preg_replace_callback() instead.
     *
     *  @var array
     *  @see $search
     */
    private $replace = array(
        '',                                     // Non-legal carriage return
        ' ',                                    // Newlines and tabs
        '',                                     // <script>s -- which strip_tags supposedly has problems with
        //'',                                   // Comments -- which strip_tags might have a problem with
        "\n\n\\1\n\n",                          // H1 - H3
        array('callback' => '_title_case_heading'), // H4 - H6
        "\n\n\t",                               // <P>
        "\n",                                   // <br>
        '\\1',                                  // <b>
        '_\\1_',                                // <i>
        "\n\n",                                 // <ul> and </ul>
        "\n\n",                                 // <ol> and </ol>
        "\t*",                                  // <li>
        // Links are currently removed completely, anchor text included; the
        // link-list rendering through _build_link_list() is disabled.
        '',                                     // <a href="">
        "\n-------------------------\n",        // <hr>
        "\n\n",                                 // <table> and </table>
        "\n",                                   // <tr> and </tr>
        "\t\t\\1\n",                            // <td> and </td>
        "\t\t\\1\n",                            // <th> and </th>
        ' ',
        '"',
        '>',
        '<',
        '&',
        '(c)',
        '(tm)',
        '"',
        '"',
        '-',
        "'",
        '&',
        '(c)',
        '(tm)',
        '--',
        '"',
        '"',
        '*',
        '(R)',
        '*',
        ''
    );

    /**
     *  List of HTML tags to let through to the resulting text, in the
     *  format expected by strip_tags() (e.g. "<p><a>").
     *
     *  @var string
     *  @see set_allowed_tags()
     */
    private $allowed_tags = '';

    /**
     *  Base URL that relative links are resolved against (no trailing slash).
     *
     *  @var string
     *  @see set_base_url()
     */
    private $url;

    /**
     *  Whether the content of $html has already been converted into $text.
     *
     *  @var boolean
     *  @see $html, $text
     */
    private $_converted = false;

    /**
     *  URL addresses collected by _build_link_list(), one "[n] url" per line.
     *
     *  @var string
     *  @see _build_link_list()
     */
    private $_link_list;

    /**
     *  Constructor.
     *
     *  If the HTML source string (or file) is supplied, the instance is
     *  ready to use: all that has to be done is to call get_text().
     *
     *  @param string $source HTML content, or a file path when $from_file is true
     *  @param boolean $from_file Indicates $source is a file to pull content from
     */
    public function __construct( $source = '', $from_file = false )
    {
        if ( !empty($source) ) {
            $this->set_html($source, $from_file);
        }
        $this->set_base_url();
    }

    /**
     *  Legacy PHP 4-style constructor name, kept so that code calling it
     *  explicitly keeps working. `new html2text(...)` uses __construct().
     *
     *  @param string $source HTML content, or a file path when $from_file is true
     *  @param boolean $from_file Indicates $source is a file to pull content from
     *  @return void
     *  @deprecated use __construct()
     */
    public function html2text( $source = '', $from_file = false )
    {
        $this->__construct($source, $from_file);
    }

    /**
     *  Loads source HTML into memory, either from $source string or a file.
     *
     *  When $from_file is true and $source names a readable regular file,
     *  the file contents are used (an unreadable or empty file yields an
     *  empty document). Otherwise $source itself is taken as the HTML.
     *
     *  @param string $source HTML content, or a file path when $from_file is true
     *  @param boolean $from_file Indicates $source is a file to pull content from
     *  @return void
     */
    public function set_html( $source, $from_file = false )
    {
        $this->html = $source;

        if ( $from_file && is_file($source) && is_readable($source) ) {
            // file_get_contents() copes with zero-length files, which
            // fread($fp, filesize()) does not.
            $contents = file_get_contents($source);
            $this->html = ( $contents === false ) ? '' : $contents;
        }

        $this->_converted = false;
    }

    /**
     *  Returns the text, converted from HTML. The conversion runs lazily on
     *  the first call after set_html().
     *
     *  @return string
     */
    public function get_text()
    {
        if ( !$this->_converted ) {
            $this->_convert();
        }

        return $this->text;
    }

    /**
     *  Prints the text, converted from HTML.
     *
     *  @return void
     */
    public function print_text()
    {
        print $this->get_text();
    }

    /**
     *  Alias to print_text(), operates identically.
     *
     *  @return void
     *  @see print_text()
     */
    public function p()
    {
        $this->print_text();
    }

    /**
     *  Sets the allowed HTML tags to pass through to the resulting text.
     *
     *  Tags should be in the form "<p>", with no corresponding closing tag.
     *  An empty value is ignored, i.e. it does not reset a previous setting.
     *
     *  @param string $allowed_tags Tags in strip_tags() format
     *  @return void
     */
    public function set_allowed_tags( $allowed_tags = '' )
    {
        if ( !empty($allowed_tags) ) {
            $this->allowed_tags = $allowed_tags;
        }
    }

    /**
     *  Sets a base URL to handle relative links.
     *
     *  With no argument the base becomes "http://" followed by the current
     *  request's Host header, or just "http://" when there is none (CLI).
     *
     *  @param string $url Base URL; one trailing slash is removed
     *  @return void
     */
    public function set_base_url( $url = '' )
    {
        if ( empty($url) ) {
            // HTTP_HOST is absent outside of a web request; do not raise a
            // notice for that.
            $host = isset($_SERVER['HTTP_HOST']) ? $_SERVER['HTTP_HOST'] : '';
            $this->url = 'http://' . $host;
        } else {
            // Strip one trailing slash for consistency (relative
            // URLs may already start with a slash like "/file.html")
            if ( substr($url, -1) == '/' ) {
                $url = substr($url, 0, -1);
            }
            $this->url = $url;
        }
    }

    /**
     *  Workhorse function that does actual conversion.
     *
     *  First performs custom tag replacement specified by $search and
     *  $replace arrays. Then strips any remaining HTML tags, reduces whitespace
     *  and newlines to a readable format, and word wraps the text to
     *  $width characters.
     *
     *  @return void
     */
    public function _convert()
    {
        // Reset the link list for this run
        $this->_link_list = '';

        // stripslashes() is historical (input that went through
        // magic_quotes); note that it also removes genuine backslashes.
        $text = trim(stripslashes((string) $this->html));

        // Run our defined search-and-replace, rule by rule and in order, so
        // that callback rules run at the same point as their neighbours.
        foreach ( $this->search as $index => $pattern ) {
            $replacement = $this->replace[$index];

            if ( is_array($replacement) ) {
                $result = preg_replace_callback(
                    $pattern,
                    array($this, $replacement['callback']),
                    $text
                );
            } else {
                $result = preg_replace($pattern, $replacement, $text);
            }

            // NULL signals a PCRE failure (e.g. backtrack limit); keep the
            // text we have rather than wiping the whole document.
            if ( $result !== null ) {
                $text = $result;
            }
        }

        // Strip any other HTML tags
        $text = strip_tags($text, $this->allowed_tags);

        // Bring down number of empty lines to 2 max
        $text = preg_replace("/\n\s+\n/", "\n", $text);
        $text = preg_replace("/[\n]{3,}/", "\n\n", $text);

        // The "Links:" footer built from $_link_list is intentionally not
        // appended: link collection is disabled in $replace.

        // Wrap the text to a readable format.
        // If width is 0 or less, don't wrap the text.
        if ( $this->width > 0 ) {
            $text = wordwrap($text, $this->width);
        }

        $this->text = $text;

        $this->_converted = true;
    }

    /**
     *  preg_replace_callback() handler for H4 - H6 headings: puts the
     *  heading on its own paragraph and upper-cases the first letter of
     *  each word.
     *
     *  @param array $matches Match data; $matches[1] is the heading's inner HTML
     *  @return string
     */
    private function _title_case_heading( $matches )
    {
        return ucwords("\n\n" . $matches[1] . "\n\n");
    }

    /**
     *  Helper function for link replacement (currently not wired into
     *  $replace, kept for callers that use it directly).
     *
     *  Maintains an internal list of links to be displayed at the end of the
     *  text, with numeric indices to the original point in the text they
     *  appeared. Links starting with http://, https:// or mailto: are listed
     *  as-is; anything else is treated as relative to the base URL.
     *
     *  @param integer $link_count Counter tracking current link number
     *  @param string $link URL of the link
     *  @param string $display Part of the text to associate number with
     *  @return string $display followed by " [n]"
     */
    public function _build_link_list( $link_count, $link, $display )
    {
        $is_absolute = substr($link, 0, 7) == 'http://'
            || substr($link, 0, 8) == 'https://'
            || substr($link, 0, 7) == 'mailto:';

        if ( $is_absolute ) {
            $this->_link_list .= "[$link_count] $link\n";
        } else {
            $this->_link_list .= "[$link_count] " . $this->url;
            // The base URL has no trailing slash; add the separator unless
            // the relative link already brings one.
            if ( substr($link, 0, 1) != '/' ) {
                $this->_link_list .= '/';
            }
            $this->_link_list .= "$link\n";
        }

        return $display . ' [' . $link_count . ']';
    }

}
439
440?>
Note: See TracBrowser for help on using the repository browser.