class.html2text.php 23 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696
  1. <?php
  2. /*************************************************************************
  3. * *
  4. * Converts HTML to formatted plain text *
  5. * *
  6. * Portions Copyright (c) 2005-2007 Jon Abernathy <jon@chuggnutt.com> *
  7. * This version from https://github.com/mtibben/html2text *
  8. * *
  9. * This script is free software; you can redistribute it and/or modify *
  10. * it under the terms of the GNU General Public License as published by *
  11. * the Free Software Foundation; either version 2 of the License, or *
  12. * (at your option) any later version. *
  13. * *
  14. * The GNU General Public License can be found at *
  15. * http://www.gnu.org/copyleft/gpl.html. *
  16. * *
  17. * This script is distributed in the hope that it will be useful, *
  18. * but WITHOUT ANY WARRANTY; without even the implied warranty of *
  19. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
  20. * GNU General Public License for more details. *
  21. * *
  22. *************************************************************************/
  /**
   * Converts an HTML string (or file) into formatted plain text.
   *
   * Conversion is regex driven: block and inline tags listed in $search and
   * $callback_search are rewritten first, remaining tags are stripped, then
   * entities are decoded and the result is word wrapped.
   */
  class html2text
  {
      /**
       * Contains the HTML content to convert.
       *
       * @var string $html
       * @access public
       */
      public $html;

      /**
       * Contains the converted, formatted text.
       *
       * @var string $text
       * @access public
       */
      public $text;

      /**
       * Maximum width of the formatted text, in columns.
       *
       * Kept for backward compatibility only: the conversion code in this
       * class does not read this property. Wrapping is controlled by the
       * 'width' entry of the options array passed to the constructor.
       *
       * @var integer $width
       * @access public
       */
      public $width = 70;

      /**
       * List of preg* regular expression patterns to search for,
       * used in conjunction with $replace.
       *
       * Every tag name is followed by \b so that a pattern only matches the
       * tag it is meant for (e.g. <i> but not <img>, <li> but not <link>).
       *
       * @var array $search
       * @access public
       * @see $replace
       */
      public $search = array(
          "/\r/",                                          // Non-legal carriage return
          "/[\n\t]+/",                                     // Newlines and tabs
          '/<head\b[^>]*>.*?<\/head>/i',                   // <head>
          '/<script\b[^>]*>.*?<\/script>/i',               // <script>s -- which strip_tags supposedly has problems with
          '/<style\b[^>]*>.*?<\/style>/i',                 // <style>s -- which strip_tags supposedly has problems with
          '/<p\b[^>]*>/i',                                 // <P>
          '/<br\b[^>]*>/i',                                // <br>
          '/<i\b[^>]*>(.*?)<\/i>/i',                       // <i>
          '/<em\b[^>]*>(.*?)<\/em>/i',                     // <em>
          '/(<ul\b[^>]*>|<\/ul>)/i',                       // <ul> and </ul>
          '/(<ol\b[^>]*>|<\/ol>)/i',                       // <ol> and </ol>
          '/<li\b[^>]*>(.*?)<\/li>/i',                     // <li> and </li>
          '/<li\b[^>]*>/i',                                // <li>
          '/<hr\b[^>]*>/i',                                // <hr>
          '/<div\b[^>]*>/i',                               // <div>
          '/(<table\b[^>]*>|<\/table>)/i',                 // <table> and </table>
          '/(<tr\b[^>]*>|<\/tr>)/i',                       // <tr> and </tr>
          '/<td\b[^>]*>(.*?)<\/td>/i',                     // <td> and </td>
          '/<span class="_html2text_ignore">.+?<\/span>/i' // <span class="_html2text_ignore">...</span>
      );

      /**
       * List of pattern replacements corresponding to patterns searched.
       *
       * @var array $replace
       * @access public
       * @see $search
       */
      public $replace = array(
          '',                              // Non-legal carriage return
          ' ',                             // Newlines and tabs
          '',                              // <head>
          '',                              // <script>s -- which strip_tags supposedly has problems with
          '',                              // <style>s -- which strip_tags supposedly has problems with
          "\n\n",                          // <P>
          "\n",                            // <br>
          '_\\1_',                         // <i>
          '_\\1_',                         // <em>
          "\n\n",                          // <ul> and </ul>
          "\n\n",                          // <ol> and </ol>
          "\t* \\1\n",                     // <li> and </li>
          "\n\t* ",                        // <li>
          "\n-------------------------\n", // <hr>
          "<div>\n",                       // <div>
          "\n\n",                          // <table> and </table>
          "\n",                            // <tr> and </tr>
          "\t\t\\1\n",                     // <td> and </td>
          ""                               // <span class="_html2text_ignore">...</span>
      );

      /**
       * List of preg* regular expression patterns to search for,
       * used in conjunction with $ent_replace.
       *
       * @var array $ent_search
       * @access public
       * @see $ent_replace
       */
      public $ent_search = array(
          '/&(nbsp|#160);/i',                              // Non-breaking space
          '/&(quot|rdquo|ldquo|#8220|#8221|#147|#148);/i', // Double quotes
          '/&(apos|rsquo|lsquo|#8216|#8217);/i',           // Single quotes
          '/&gt;/i',                                       // Greater-than
          '/&lt;/i',                                       // Less-than
          '/&(copy|#169);/i',                              // Copyright
          '/&(trade|#8482|#153);/i',                       // Trademark
          '/&(reg|#174);/i',                               // Registered
          '/&(mdash|#151|#8212);/i',                       // mdash
          '/&(ndash|minus|#8211|#8722);/i',                // ndash
          '/&(bull|#149|#8226);/i',                        // Bullet
          '/&(pound|#163);/i',                             // Pound sign
          '/&(euro|#8364);/i',                             // Euro sign
          '/&(amp|#38);/i',                                // Ampersand: see _converter()
          '/[ ]{2,}/',                                     // Runs of spaces, post-handling
      );

      /**
       * List of pattern replacements corresponding to patterns searched.
       *
       * @var array $ent_replace
       * @access public
       * @see $ent_search
       */
      public $ent_replace = array(
          ' ',         // Non-breaking space
          '"',         // Double quotes
          "'",         // Single quotes
          '>',
          '<',
          '(c)',
          '(tm)',
          '(R)',
          '--',
          '-',
          '*',
          '£',
          'EUR',       // Euro sign. € ?
          '|+|amp|+|', // Ampersand: see _converter()
          ' ',         // Runs of spaces, post-handling
      );

      /**
       * List of preg* regular expression patterns to search for
       * and replace using callback function.
       *
       * The first capture group of every pattern is the tag name that
       * _preg_callback() dispatches on.
       *
       * @var array $callback_search
       * @access public
       */
      public $callback_search = array(
          '/<(a) [^>]*href=("|\')([^"\']+)\2([^>]*)>(.*?)<\/a>/i', // <a href="">
          '/<(h)[123456]( [^>]*)?>(.*?)<\/h[123456]>/i',           // h1 - h6
          '/<(b)( [^>]*)?>(.*?)<\/b>/i',                           // <b>
          '/<(strong)( [^>]*)?>(.*?)<\/strong>/i',                 // <strong>
          '/<(th)( [^>]*)?>(.*?)<\/th>/i',                         // <th> and </th>
      );

      /**
       * List of preg* regular expression patterns to search for in PRE body,
       * used in conjunction with $pre_replace.
       *
       * @var array $pre_search
       * @access public
       * @see $pre_replace
       */
      public $pre_search = array(
          "/\n/",
          "/\t/",
          '/ /',
          '/<pre[^>]*>/',
          '/<\/pre>/'
      );

      /**
       * List of pattern replacements corresponding to patterns searched for PRE body.
       *
       * @var array $pre_replace
       * @access public
       * @see $pre_search
       */
      public $pre_replace = array(
          '<br>',
          '&nbsp;&nbsp;&nbsp;&nbsp;',
          '&nbsp;',
          '',
          ''
      );

      /**
       * Contains a list of HTML tags to allow in the resulting text.
       *
       * @var string $allowed_tags
       * @access public
       * @see set_allowed_tags()
       */
      public $allowed_tags = '';

      /**
       * Contains the base URL that relative links should resolve to.
       *
       * @var string $url
       * @access public
       */
      public $url;

      /**
       * Indicates whether content in the $html variable has been converted yet.
       *
       * @var boolean $_converted
       * @access private
       * @see $html, $text
       */
      private $_converted = false;

      /**
       * Contains URL addresses from links to be rendered in plain text.
       *
       * @var array $_link_list
       * @access private
       * @see _build_link_list()
       */
      private $_link_list = array();

      /**
       * Scratch buffer holding the converted body of the PRE element that
       * _convert_pre() is currently substituting into the text.
       *
       * Declared explicitly so no dynamic property is created at runtime.
       *
       * @var string $pre_content
       * @access private
       * @see _convert_pre(), _preg_pre_callback()
       */
      private $pre_content = '';

      /**
       * Various configuration options (able to be set in the constructor)
       *
       * @var array $_options
       * @access private
       */
      private $_options = array(
          // 'none'
          // 'inline' (show links inline)
          // 'nextline' (show links on the next line)
          // 'table' (if a table of link URLs should be listed after the text.
          'do_links' => 'inline',
          // Maximum width of the formatted text, in columns.
          // Set this value to 0 (or less) to ignore word wrapping
          // and not constrain text to a fixed-width column.
          'width' => 70,
      );

      /**
       * Constructor.
       *
       * If the HTML source string (or file) is supplied, the class
       * will instantiate with that source propagated, all that has
       * to be done is to call get_text().
       *
       * @param string  $source    HTML content
       * @param boolean $from_file Indicates $source is a file to pull content from
       * @param array   $options   Set configuration options (merged over the defaults)
       * @access public
       * @return void
       */
      public function __construct( $source = '', $from_file = false, $options = array() )
      {
          $this->_options = array_merge($this->_options, $options);

          // empty() alone would discard the literal document "0"
          if ( !empty($source) || $source === '0' ) {
              $this->set_html($source, $from_file);
          }

          $this->set_base_url();
      }

      /**
       * Loads source HTML into memory, either from $source string or a file.
       *
       * When $from_file is true but the file does not exist, $source itself
       * is stored as the HTML content.
       *
       * @param string  $source    HTML content
       * @param boolean $from_file Indicates $source is a file to pull content from
       * @access public
       * @return void
       */
      public function set_html( $source, $from_file = false )
      {
          if ( $from_file && file_exists($source) ) {
              $this->html = file_get_contents($source);
          } else {
              $this->html = $source;
          }

          // force a fresh conversion on the next get_text()
          $this->_converted = false;
      }

      /**
       * Returns the text, converted from HTML.
       *
       * The conversion is performed lazily and only once per loaded document.
       *
       * @access public
       * @return string
       */
      public function get_text()
      {
          if ( !$this->_converted ) {
              $this->_convert();
          }

          return $this->text;
      }

      /**
       * Prints the text, converted from HTML.
       *
       * @access public
       * @return void
       */
      public function print_text()
      {
          print $this->get_text();
      }

      /**
       * Alias to print_text(), operates identically.
       *
       * @access public
       * @return void
       * @see print_text()
       */
      public function p()
      {
          print $this->get_text();
      }

      /**
       * Sets the allowed HTML tags to pass through to the resulting text.
       *
       * Tags should be in the form "<p>", with no corresponding closing tag.
       * An empty argument leaves the current setting untouched.
       *
       * @param string $allowed_tags Tags to keep, as accepted by strip_tags()
       * @access public
       * @return void
       */
      public function set_allowed_tags( $allowed_tags = '' )
      {
          if ( !empty($allowed_tags) ) {
              $this->allowed_tags = $allowed_tags;
          }
      }

      /**
       * Sets a base URL to handle relative links.
       *
       * With an empty argument the base falls back to "http://" followed by
       * $_SERVER['HTTP_HOST'] when that is available, otherwise to ''.
       *
       * @param string $url Base URL
       * @access public
       * @return void
       */
      public function set_base_url( $url = '' )
      {
          if ( empty($url) ) {
              if ( !empty($_SERVER['HTTP_HOST']) ) {
                  $this->url = 'http://' . $_SERVER['HTTP_HOST'];
              } else {
                  $this->url = '';
              }
              return;
          }

          // Strip a trailing slash for consistency (relative
          // URLs may already start with a slash like "/file.html")
          if ( substr($url, -1) == '/' ) {
              $url = substr($url, 0, -1);
          }

          $this->url = $url;
      }

      /**
       * Workhorse function that does actual conversion (calls _converter() method).
       *
       * Stores the result in $text and appends the collected link table,
       * if any links were registered in 'table' mode.
       *
       * @access private
       * @return void
       */
      private function _convert()
      {
          // Variables used for building the link list
          $this->_link_list = array();

          // Cast guards against $html never having been set (null).
          // NOTE(review): stripslashes() also removes legitimate backslashes
          // from the document; kept because callers may rely on it - confirm.
          $text = trim(stripslashes((string) $this->html));

          // Convert HTML to TXT
          $this->_converter($text);

          // Add link list
          if ( !empty($this->_link_list) ) {
              $text .= "\n\nLinks:\n------\n";
              foreach ($this->_link_list as $idx => $url) {
                  $text .= '[' . ($idx + 1) . '] ' . $url . "\n";
              }
          }

          $this->text       = $text;
          $this->_converted = true;
      }

      /**
       * Workhorse function that does actual conversion.
       *
       * First performs custom tag replacement specified by $search and
       * $replace arrays. Then strips any remaining HTML tags, reduces whitespace
       * and newlines to a readable format, and word wraps the text to
       * $this->_options['width'] characters.
       *
       * @param string Reference to HTML content string (converted in place)
       *
       * @access private
       * @return void
       */
      private function _converter(&$text)
      {
          // Convert <BLOCKQUOTE> (before PRE!)
          $this->_convert_blockquotes($text);

          // Convert <PRE>
          $this->_convert_pre($text);

          // Run our defined tags search-and-replace
          $text = preg_replace($this->search, $this->replace, $text);

          // Run our defined tags search-and-replace with callback
          $text = preg_replace_callback($this->callback_search, array($this, '_preg_callback'), $text);

          // Strip any other HTML tags
          $text = strip_tags($text, $this->allowed_tags);

          // Run our defined entities/characters search-and-replace
          $text = preg_replace($this->ent_search, $this->ent_replace, $text);

          // Replace known html entities
          $text = html_entity_decode($text, ENT_QUOTES);

          // Remove unknown/unhandled entities (this cannot be done in search-and-replace block)
          $text = preg_replace('/&([a-zA-Z0-9]{2,6}|#[0-9]{2,4});/', '', $text);

          // Convert "|+|amp|+|" into "&", need to be done after handling of unknown entities
          // This properly handles situation of "&amp;quot;" in input string
          $text = str_replace('|+|amp|+|', '&', $text);

          // Bring down number of empty lines to 2 max
          $text = preg_replace("/\n\s+\n/", "\n\n", $text);
          $text = preg_replace("/[\n]{3,}/", "\n\n", $text);

          // remove leading empty lines (can be produced by eg. P tag on the beginning)
          $text = ltrim($text, "\n");

          // Wrap the text to a readable format.
          // If width is 0 or less, don't wrap the text.
          if ( $this->_options['width'] > 0 ) {
              $text = wordwrap($text, $this->_options['width']);
          }
      }

      /**
       * Helper function called by _preg_callback() on link replacement.
       *
       * Depending on the link method, renders the URL inline, on the next
       * line, or registers it in an internal list that _convert() prints
       * after the text. Relative links are prefixed with the base URL.
       *
       * @param string $link          URL of the link
       * @param string $display       Part of the text to associate number with
       * @param string $link_override Link method overriding the 'do_links' option
       * @access private
       * @return string
       */
      private function _build_link_list( $link, $display, $link_override = null )
      {
          $link_method = $link_override ? $link_override : $this->_options['do_links'];

          if ($link_method == 'none') {
              return $display;
          }

          // Ignored link types
          if (preg_match('!^(javascript:|mailto:|#)!i', $link)) {
              return $display;
          }

          if (preg_match('!^([a-z][a-z0-9.+-]+:)!i', $link)) {
              // link carries its own scheme: use as is
              $url = $link;
          } else {
              // relative link: resolve against the base URL
              $url = $this->url;
              if (substr($link, 0, 1) != '/') {
                  $url .= '/';
              }
              $url .= "$link";
          }

          if ($link_method == 'table') {
              // strict search: URLs are strings and must not be compared numerically
              $index = array_search($url, $this->_link_list, true);
              if ($index === false) {
                  $index = count($this->_link_list);
                  $this->_link_list[] = $url;
              }

              return $display . ' [' . ($index + 1) . ']';
          }

          if ($link_method == 'nextline') {
              return $display . "\n[" . $url . ']';
          }

          // link_method defaults to inline
          return $display . ' [' . $url . ']';
      }

      /**
       * Helper function for PRE body conversion.
       *
       * Replaces every PRE element by a DIV in which newlines, tabs and
       * spaces are encoded as <br> / &nbsp; (see $pre_search, $pre_replace).
       *
       * @param string HTML content (converted in place)
       * @access private
       */
      private function _convert_pre(&$text)
      {
          // get the content of PRE element
          while (preg_match('/<pre\b[^>]*>(.*)<\/pre>/ismU', $text, $matches)) {
              // Run our defined tags search-and-replace with callback
              $content = preg_replace_callback($this->callback_search,
                  array($this, '_preg_callback'), $matches[1]);

              // convert the content
              $this->pre_content = sprintf('<div><br>%s<br></div>',
                  preg_replace($this->pre_search, $this->pre_replace, $content));

              // replace the content (use callback because content can contain $0 variable)
              $text = preg_replace_callback('/<pre\b[^>]*>.*<\/pre>/ismU',
                  array($this, '_preg_pre_callback'), $text, 1);

              // free memory
              $this->pre_content = '';
          }
      }

      /**
       * Helper function for BLOCKQUOTE body conversion.
       *
       * Each top-level blockquote is converted on its own (nested ones are
       * handled by the recursive _converter() call), prefixed with "> "
       * citation markers and wrapped in a PRE element.
       *
       * @param string HTML content (converted in place)
       * @access private
       */
      private function _convert_blockquotes(&$text)
      {
          if (!preg_match_all('/<\/*blockquote[^>]*>/i', $text, $matches, PREG_OFFSET_CAPTURE)) {
              return;
          }

          $level  = 0; // current nesting depth
          $diff   = 0; // total number of bytes $text has shrunk by so far
          $start  = 0; // offset (in the original text) of the outermost opening tag
          $taglen = 0; // length of that opening tag

          foreach ($matches[0] as $m) {
              $tag    = $m[0];
              $offset = $m[1]; // always relative to the ORIGINAL text

              if (!($tag[0] == '<' && $tag[1] == '/')) {
                  // opening tag
                  if ($level == 0) {
                      $start  = $offset;
                      $taglen = strlen($tag);
                  }
                  $level++;
                  continue;
              }

              // closing tag
              $level--;
              if ($level < 0) {
                  $level = 0; // malformed HTML: go to next blockquote
                  continue;
              }
              if ($level > 0) {
                  continue;   // skip inner blockquote
              }

              $end = $offset;
              $len = $end - $taglen - $start;

              // Get blockquote content
              $body = substr($text, $start + $taglen - $diff, $len);

              // Set text width (leave room for the "> " marker)
              $p_width = $this->_options['width'];
              if ($this->_options['width'] > 0) {
                  $this->_options['width'] -= 2;
              }

              // Convert blockquote content
              $body = trim($body);
              $this->_converter($body);

              // Add citation markers and create PRE block
              $body = preg_replace('/((^|\n)>*)/', '\\1> ', trim($body));
              $body = '<pre>' . htmlspecialchars($body) . '</pre>';

              // Re-set text width
              $this->_options['width'] = $p_width;

              // Replace content
              $text = substr($text, 0, $start - $diff)
                  . $body . substr($text, $end + strlen($tag) - $diff);

              // Accumulate the shift: match offsets refer to the original
              // text, so every earlier replacement has to be accounted for.
              $diff += $len + $taglen + strlen($tag) - strlen($body);
              unset($body);
          }
      }

      /**
       * Callback function for preg_replace_callback use.
       *
       * Dispatches on the tag name captured as the first group of the
       * $callback_search patterns.
       *
       * @param array PREG matches
       * @return string
       */
      private function _preg_callback($matches)
      {
          switch (strtolower($matches[1])) {
              case 'b':
              case 'strong':
                  return $this->_toupper($matches[3]);

              case 'th':
                  return $this->_toupper("\t\t" . $matches[3] . "\n");

              case 'h':
                  return $this->_toupper("\n\n" . $matches[3] . "\n\n");

              case 'a':
                  // override the link method
                  $link_override = null;
                  if (preg_match("/_html2text_link_(\w+)/", $matches[4], $link_override_match)) {
                      $link_override = $link_override_match[1];
                  }
                  // Remove spaces in URL (#1487805)
                  $url = str_replace(' ', '', $matches[3]);

                  return $this->_build_link_list($url, $matches[5], $link_override);

              default:
                  // Pattern added to $callback_search without a handler:
                  // leave the matched markup untouched instead of deleting it.
                  return $matches[0];
          }
      }

      /**
       * Callback function for preg_replace_callback use in PRE content handler.
       *
       * @param array PREG matches (unused)
       * @return string Converted PRE body prepared by _convert_pre()
       */
      private function _preg_pre_callback($matches)
      {
          return $this->pre_content;
      }

      /**
       * Strtoupper function with HTML tags and entities handling.
       *
       * @param string $str Text to convert
       * @return string Converted text
       */
      private function _toupper($str)
      {
          // string can contain HTML tags; -1 means "no limit"
          $chunks = preg_split('/(<[^>]*>)/', $str, -1, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE);

          // convert toupper only the text between HTML tags
          foreach ($chunks as $idx => $chunk) {
              if ($chunk[0] != '<') {
                  $chunks[$idx] = $this->_strtoupper($chunk);
              }
          }

          return implode('', $chunks);
      }

      /**
       * Strtoupper multibyte wrapper function with HTML entities handling.
       *
       * Entities are decoded first so their names are not upper-cased,
       * then special characters are encoded again.
       *
       * @param string $str Text to convert
       * @return string Converted text
       */
      private function _strtoupper($str)
      {
          $str = html_entity_decode($str, ENT_COMPAT);

          if (function_exists('mb_strtoupper')) {
              $str = mb_strtoupper($str);
          } else {
              $str = strtoupper($str);
          }

          return htmlspecialchars($str, ENT_COMPAT);
      }
  }