PHP Method To Extract Text From HTML (Tag-Based) Content

Posted at

<?php
  // Drain any active output buffers so the response is streamed unbuffered.
  // ob_end_flush() returns false when a buffer cannot be removed (e.g. one
  // started without the PHP_OUTPUT_HANDLER_REMOVABLE flag); stop there,
  // otherwise ob_get_level() never reaches 0 and the loop spins forever.
  while (ob_get_level() > 0) {
    if (!ob_end_flush()) {
      break;
    }
  }

  date_default_timezone_set('Asia/Jerusalem');

  // Treat every string as UTF-8, both in mbstring and in locale-aware calls.
  mb_language('uni');
  mb_internal_encoding('UTF-8');
  setlocale(LC_ALL, 'en_US.UTF-8');

  // The charset is announced through Content-Type only. Content-Encoding is
  // reserved for compression codings (gzip, br, ...) and must not carry a
  // charset name, so no such header is sent.
  header('Content-Language: en');
  header('Content-Type: text/plain; charset=UTF-8');

  header('Access-Control-Allow-Origin: *');
  header('X-UA-Compatible: IE=edge,chrome=1');


  /**
   * Reduce an HTML document to one line of space-separated plain text.
   *
   * Steps, in order:
   *  1. drop comments and elements whose content is not prose
   *     (head, script, iframe, noscript, style, meta, link);
   *  2. strip the remaining tags and decode entities, so "&amp;" / "&nbsp;"
   *     do not survive as the bogus words "amp" / "nbsp";
   *  3. remove ASCII digits, the punctuation set ()[],.-_!&;:# and control
   *     characters;
   *  4. collapse every whitespace run (no-break spaces included) to one space.
   *
   * The result is meant for a text/plain response, so nothing is HTML-escaped
   * and non-ASCII (UTF-8) characters are kept.
   *
   * @param string $html Raw HTML; malformed UTF-8 is repaired before matching.
   * @return string Extracted text, trimmed.
   * @throws RuntimeException If PCRE fails (e.g. backtrack limit exceeded).
   */
  function htmlToPlainText(string $html): string
  {
    // Patterns using the /u modifier fail outright on malformed UTF-8, so
    // invalid byte sequences are replaced up front.
    if (!mb_check_encoding($html, 'UTF-8')) {
      $html = mb_convert_encoding($html, 'UTF-8', 'UTF-8');
    }

    // \b keeps "<head" from matching "<header", "<link" from "<linkfoo", etc.
    $nonTextMarkup = [
      '#<!--.*?-->#s',
      '#<head\b[^>]*>.*?</head>#is',
      '#<script\b[^>]*>.*?</script>#is',
      '#<iframe\b[^>]*>.*?</iframe>#is',
      '#<noscript\b[^>]*>.*?</noscript>#is',
      '#<style\b[^>]*>.*?</style>#is',
      '#<(?:meta|link)\b[^>]*>#i',
    ];
    $text = preg_replace($nonTextMarkup, '', $html);
    if ($text === null) {
      throw new RuntimeException('PCRE error while removing markup: ' . preg_last_error());
    }

    // Tags go first, entities second: a decoded "&lt;b&gt;" is literal text
    // and must not be mistaken for a tag.
    $text = strip_tags($text);
    $text = html_entity_decode($text, ENT_QUOTES | ENT_HTML5, 'UTF-8');

    $cleanup = [
      '/[0-9]+/'                      => '',  // digits
      '/[()\[\],.\-_!&;:#]+/'         => '',  // punctuation
      '/[\x00-\x08\x0E-\x1F\x7F]+/'   => '',  // control characters (not whitespace)
      '/[\s\p{Z}]+/u'                 => ' ', // any whitespace run, incl. U+00A0
    ];
    $text = preg_replace(array_keys($cleanup), array_values($cleanup), $text);
    if ($text === null) {
      throw new RuntimeException('PCRE error while cleaning text: ' . preg_last_error());
    }

    return trim($text);
  }

  $sourcePath = './demo.html.txt';
  $html = is_readable($sourcePath) ? file_get_contents($sourcePath) : false;

  if ($html === false) {
    // Report the failure instead of silently producing an empty response;
    // $html stays a string for any code that runs afterwards.
    http_response_code(500);
    $html = '';
    echo "Unable to read {$sourcePath}";
  } else {
    $html = htmlToPlainText($html);
    echo $html;
  }
  
  
  // Alternative, layout-preserving conversion table (HTML => plain-text
  // formatting). It is kept for reference and is NOT applied: the calls that
  // would use it are disabled below. Keys are PCRE patterns, values their
  // replacements; preg_replace() applies them in this order.
  //
  // Tag patterns carry \b so that e.g. "<i" does not match "<img>", "<p" does
  // not match "<pre>" and "<li" does not match "<link>".
  $patterns1 = [
    "/\r/"                                            => '',          // Carriage returns
    "/[\n\t]+/"                                       => ' ',         // Newlines and tabs
    '/[ ]{2,}/'                                       => ' ',         // Runs of spaces, pre-handling
    '/<script\b[^>]*>.*?<\/script>/i'                 => '',          // <script> blocks
    '/<style\b[^>]*>.*?<\/style>/i'                   => '',          // <style> blocks
    // Lazy match: a greedy ".*" would swallow everything between the first
    // and the last comment, since the newlines are already collapsed.
    '/<!--.*?-->/s'                                   => '',          // Comments

    // The rules below relied on the /e modifier, which PHP 7 removed; they
    // need preg_replace_callback() before they can be re-enabled.
    //, '/<h[123][^>]*>(.*?)<\/h[123]>/ie'              => mb_strtoupper("\n\n\\1\n\n")      // H1 - H3
    //, '/<h[456][^>]*>(.*?)<\/h[456]>/ie'              => ucwords("\n\n\\1\n\n")      // H4 - H6
    '/<p\b[^>]*>/i'                                   => "\n\n\t",    // <p>
    '/<br\b[^>]*>/i'                                  => "\n",        // <br>
    //, '/<b[^>]*>(.*?)<\/b>/ie'                        => mb_strtoupper("\\1")                // <b>
    //, '/<strong[^>]*>(.*?)<\/strong>/ie'              => mb_strtoupper("\\1")      // <strong>
    '/<i\b[^>]*>(.*?)<\/i>/i'                         => '_\\1_',     // <i>
    '/<em\b[^>]*>(.*?)<\/em>/i'                       => '_\\1_',     // <em>
    '/(<ul\b[^>]*>|<\/ul>)/i'                         => "\n\n",      // <ul> and </ul>
    '/(<ol\b[^>]*>|<\/ol>)/i'                         => "\n\n",      // <ol> and </ol>
    '/<li\b[^>]*>(.*?)<\/li>/i'                       => "\t* \\1\n", // <li> and </li>
    '/<li\b[^>]*>/i'                                  => "\n\t* ",    // <li> without a closing tag
    //, '/<a [^>]*href="([^"]+)"[^>]*>(.*?)<\/a>/ie'    => "\\2"//'$this->_build_link_list("\\1", "\\2")'
    '/<hr\b[^>]*>/i'                                  => "\n-------------------------\n", // <hr>
    '/(<table\b[^>]*>|<\/table>)/i'                   => "\n\n",      // <table> and </table>
    '/(<tr\b[^>]*>|<\/tr>)/i'                         => "\n",        // <tr> and </tr>
    '/<td\b[^>]*>(.*?)<\/td>/i'                       => "\t\t\\1\n", // <td> and </td>
    //, '/<th[^>]*>(.*?)<\/th>/ie'                      => mb_strtoupper("\t\t\\1\n")      // <th> and </th>
    '/&(nbsp|#160);/i'                                => ' ',         // Non-breaking space
    '/&(quot|rdquo|ldquo|#8220|#8221|#147|#148);/i'   => '"',         // Double quotes
    '/&(apos|rsquo|lsquo|#8216|#8217);/i'             => "'",         // Single quotes
    '/&(gt|#62);/i'                                   => '>',         // Greater-than
    '/&(lt|#60);/i'                                   => '<',         // Less-than
    '/&(amp|#38);/i'                                  => '&',         // Ampersand
    '/&(copy|#169);/i'                                => '(c)',       // Copyright
    '/&(trade|#8482|#153);/i'                         => '(tm)',      // Trademark
    '/&(reg|#174);/i'                                 => '(R)',       // Registered
    '/&(mdash|#151|#8212);/i'                         => '--',        // mdash
    '/&(ndash|minus|#8211|#8722);/i'                  => '-',         // ndash
    '/&(bull|#149|#8226);/i'                          => '*',         // Bullet
    '/&(pound|#163);/i'                               => "\u{00A3}",  // Pound sign, written as an escape so the file encoding cannot mangle it
    '/&(euro|#8364);/i'                               => 'EUR',       // Euro sign
    '/&[^&;]+;/i'                                     => '',          // Unknown/unhandled entities
    '/[ ]{2,}/'                                       => ' ',         // Runs of spaces, post-handling
  ];
  //$html = preg_replace(array_keys($patterns1), array_values($patterns1), $html);

  //$html = strip_tags($html);

  // No second echo here: $html has already been printed once, and while the
  // table above stays disabled a further echo would only repeat the whole
  // response. Add an echo back together with the two calls above if this
  // table is ever switched on.


https://gist.github.com/eladkarako/5e94a8467677057e70378eb6131af954#file-html2text-php