Using PHP substr() and strip_tags() while retaining formatting and without breaking HTML

烂漫一生 提交于 2019-11-26 20:23:53
code ex machina

Not amazing, but works.

function html_cut($text, $max_length)
{
    $tags   = array();
    $result = "";

    $is_open   = false;
    $grab_open = false;
    $is_close  = false;
    $in_double_quotes = false;
    $in_single_quotes = false;
    $tag = "";

    $i = 0;
    $stripped = 0;

    $stripped_text = strip_tags($text);

    while ($i < strlen($text) && $stripped < strlen($stripped_text) && $stripped < $max_length)
    {
        $symbol  = $text{$i};
        $result .= $symbol;

        switch ($symbol)
        {
           case '<':
                $is_open   = true;
                $grab_open = true;
                break;

           case '"':
               if ($in_double_quotes)
                   $in_double_quotes = false;
               else
                   $in_double_quotes = true;

            break;

            case "'":
              if ($in_single_quotes)
                  $in_single_quotes = false;
              else
                  $in_single_quotes = true;

            break;

            case '/':
                if ($is_open && !$in_double_quotes && !$in_single_quotes)
                {
                    $is_close  = true;
                    $is_open   = false;
                    $grab_open = false;
                }

                break;

            case ' ':
                if ($is_open)
                    $grab_open = false;
                else
                    $stripped++;

                break;

            case '>':
                if ($is_open)
                {
                    $is_open   = false;
                    $grab_open = false;
                    array_push($tags, $tag);
                    $tag = "";
                }
                else if ($is_close)
                {
                    $is_close = false;
                    array_pop($tags);
                    $tag = "";
                }

                break;

            default:
                if ($grab_open || $is_close)
                    $tag .= $symbol;

                if (!$is_open && !$is_close)
                    $stripped++;
        }

        $i++;
    }

    while ($tags)
        $result .= "</".array_pop($tags).">";

    return $result;
}

Usage example:

$content = html_cut($content, 100);
deceze

I'm not claiming to have invented this, but there is a very complete Text::truncate() method in CakePHP which does what you want:

function truncate($text, $length = 100, $ending = '...', $exact = true, $considerHtml = false) {
    if (is_array($ending)) {
        extract($ending);
    }
    if ($considerHtml) {
        if (mb_strlen(preg_replace('/<.*?>/', '', $text)) <= $length) {
            return $text;
        }
        $totalLength = mb_strlen($ending);
        $openTags = array();
        $truncate = '';
        preg_match_all('/(<\/?([\w+]+)[^>]*>)?([^<>]*)/', $text, $tags, PREG_SET_ORDER);
        foreach ($tags as $tag) {
            if (!preg_match('/img|br|input|hr|area|base|basefont|col|frame|isindex|link|meta|param/s', $tag[2])) {
                if (preg_match('/<[\w]+[^>]*>/s', $tag[0])) {
                    array_unshift($openTags, $tag[2]);
                } else if (preg_match('/<\/([\w]+)[^>]*>/s', $tag[0], $closeTag)) {
                    $pos = array_search($closeTag[1], $openTags);
                    if ($pos !== false) {
                        array_splice($openTags, $pos, 1);
                    }
                }
            }
            $truncate .= $tag[1];

            $contentLength = mb_strlen(preg_replace('/&[0-9a-z]{2,8};|&#[0-9]{1,7};|&#x[0-9a-f]{1,6};/i', ' ', $tag[3]));
            if ($contentLength + $totalLength > $length) {
                $left = $length - $totalLength;
                $entitiesLength = 0;
                if (preg_match_all('/&[0-9a-z]{2,8};|&#[0-9]{1,7};|&#x[0-9a-f]{1,6};/i', $tag[3], $entities, PREG_OFFSET_CAPTURE)) {
                    foreach ($entities[0] as $entity) {
                        if ($entity[1] + 1 - $entitiesLength <= $left) {
                            $left--;
                            $entitiesLength += mb_strlen($entity[0]);
                        } else {
                            break;
                        }
                    }
                }

                $truncate .= mb_substr($tag[3], 0 , $left + $entitiesLength);
                break;
            } else {
                $truncate .= $tag[3];
                $totalLength += $contentLength;
            }
            if ($totalLength >= $length) {
                break;
            }
        }

    } else {
        if (mb_strlen($text) <= $length) {
            return $text;
        } else {
            $truncate = mb_substr($text, 0, $length - strlen($ending));
        }
    }
    if (!$exact) {
        $spacepos = mb_strrpos($truncate, ' ');
        if (isset($spacepos)) {
            if ($considerHtml) {
                $bits = mb_substr($truncate, $spacepos);
                preg_match_all('/<\/([a-z]+)>/', $bits, $droppedTags, PREG_SET_ORDER);
                if (!empty($droppedTags)) {
                    foreach ($droppedTags as $closingTag) {
                        if (!in_array($closingTag[1], $openTags)) {
                            array_unshift($openTags, $closingTag[1]);
                        }
                    }
                }
            }
            $truncate = mb_substr($truncate, 0, $spacepos);
        }
    }

    $truncate .= $ending;

    if ($considerHtml) {
        foreach ($openTags as $tag) {
            $truncate .= '</'.$tag.'>';
        }
    }

    return $truncate;
}
leepowers

Use PHP's DOMDocument class to normalize an HTML fragment:

$dom= new DOMDocument();
$dom->loadHTML('<div><p>Hello World');      
$xpath = new DOMXPath($dom);
$body = $xpath->query('/html/body');
echo($dom->saveXml($body->item(0)));

This question is similar to an earlier question and I've copied and pasted one solution here. If the HTML is submitted by users you'll also need to filter out potential Javascript attack vectors like onmouseover="do_something_evil()" or <a href="javascript:more_evil();">...</a>. Tools like HTML Purifier were designed to catch and solve these problems and are far more comprehensive than any code that I could post.

Use a HTML parser and stop after 100 characters of text.

metrobalderas

You should use Tidy HTML. You cut the string and then you run Tidy to close the tags.

(Credits where credits are due)

I made another function to do it, it supports UTF-8:

/**
 * Limit string without break html tags.
 * Supports UTF8
 * 
 * @param string $value
 * @param int $limit Default 100
 */
function str_limit_html($value, $limit = 100)
{

    if (mb_strwidth($value, 'UTF-8') <= $limit) {
        return $value;
    }

    // Strip text with HTML tags, sum html len tags too.
    // Is there another way to do it?
    do {
        $len          = mb_strwidth($value, 'UTF-8');
        $len_stripped = mb_strwidth(strip_tags($value), 'UTF-8');
        $len_tags     = $len - $len_stripped;

        $value = mb_strimwidth($value, 0, $limit + $len_tags, '', 'UTF-8');
    } while ($len_stripped > $limit);

    // Load as HTML ignoring errors
    $dom = new DOMDocument();
    @$dom->loadHTML('<?xml encoding="utf-8" ?>'.$value, LIBXML_HTML_NODEFDTD);

    // Fix the html errors
    $value = $dom->saveHtml($dom->getElementsByTagName('body')->item(0));

    // Remove body tag
    $value = mb_strimwidth($value, 6, mb_strwidth($value, 'UTF-8') - 13, '', 'UTF-8'); // <body> and </body>
    // Remove empty tags
    return preg_replace('/<(\w+)\b(?:\s+[\w\-.:]+(?:\s*=\s*(?:"[^"]*"|"[^"]*"|[\w\-.:]+))?)*\s*\/?>\s*<\/\1\s*>/', '', $value);
}

SEE DEMO.

I recommend use html_entity_decode at the start of function, so it preserves the UTF-8 characters:

 $value = html_entity_decode($value);

Regardless of the 100 count issues you state at the beginning, you indicate in the challenge the following:

  • output the character count of strip_tags (the number of characters in the actual displayed text of the HTML)
  • retain HTML formatting close
  • any unfinished HTML tag

Here is my proposal: Bascially, I parse through each character counting as I go. I make sure NOT to count any characters in any HTML tag. I also check at the end to make sure I am not in the middle of a word when I stop. Once I stop, I back track to the first available SPACE or > as a stopping point.

$position = 0;
$length = strlen($content)-1;

// process the content putting each 100 character section into an array
while($position < $length)
{
    $next_position = get_position($content, $position, 100);
    $data[] = substr($content, $position, $next_position);
    $position = $next_position;
}

// show the array
print_r($data);

function get_position($content, $position, $chars = 100)
{
    $count = 0;
    // count to 100 characters skipping over all of the HTML
    while($count <> $chars){
        $char = substr($content, $position, 1); 
        if($char == '<'){
            do{
                $position++;
                $char = substr($content, $position, 1);
            } while($char !== '>');
            $position++;
            $char = substr($content, $position, 1);
        }
        $count++;
        $position++;
    }
echo $count."\n";
    // find out where there is a logical break before 100 characters
    $data = substr($content, 0, $position);

    $space = strrpos($data, " ");
    $tag = strrpos($data, ">");

    // return the position of the logical break
    if($space > $tag)
    {
        return $space;
    } else {
        return $tag;
    }  
}

This will also count the return codes etc. Considering they will take space, I have not removed them.

Here is a function I'm using in one of my projects. It's based on DOMDocument, works with HTML5 and is about 2x faster than other solutions I've tried (at least on my machine, 0.22 ms vs 0.43 ms using html_cut($text, $max_length) from the top answer on a 500 text-node-characters string with a limit of 400).

function cut_html ($html, $limit) {
    $dom = new DOMDocument();
    $dom->loadHTML(mb_convert_encoding("<div>{$html}</div>", "HTML-ENTITIES", "UTF-8"), LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD);
    cut_html_recursive($dom->documentElement, $limit);
    return substr($dom->saveHTML($dom->documentElement), 5, -6);
}

function cut_html_recursive ($element, $limit) {
    if($limit > 0) {
        if($element->nodeType == 3) {
            $limit -= strlen($element->nodeValue);
            if($limit < 0) {
                $element->nodeValue = substr($element->nodeValue, 0, strlen($element->nodeValue) + $limit);
            }
        }
        else {
            for($i = 0; $i < $element->childNodes->length; $i++) {
                if($limit > 0) {
                    $limit = cut_html_recursive($element->childNodes->item($i), $limit);
                }
                else {
                    $element->removeChild($element->childNodes->item($i));
                    $i--;
                }
            }
        }
    }
    return $limit;
}

Here is my try at the cutter. Maybe you guys can catch some bugs. The problem, i found with the other parsers, is that they don't close tags properly and they cut in the middle of a word (blah)

function cutHTML($string, $length, $patternsReplace = false) {
    $i = 0;
    $count = 0;
    $isParagraphCut = false;
    $htmlOpen = false;
    $openTag = false;
    $tagsStack = array();

    while ($i < strlen($string)) {
        $char = substr($string, $i, 1);
        if ($count >= $length) {
            $isParagraphCut = true;
            break;
        }

        if ($htmlOpen) {
            if ($char === ">") {
                $htmlOpen = false;
            }
        } else {
            if ($char === "<") {
                $j = $i;
                $char = substr($string, $j, 1);

                while ($j < strlen($string)) {
                    if($char === '/'){
                        $i++;
                        break;
                    }
                    elseif ($char === ' ') {
                        $tagsStack[] = substr($string, $i, $j);
                    }
                    $j++;
                }
                $htmlOpen = true;
            }
        }

        if (!$htmlOpen && $char != ">") {
            $count++;
        }

        $i++;
    }

    if ($isParagraphCut) {
        $j = $i;
        while ($j > 0) {
            $char = substr($string, $j, 1);
            if ($char === " " || $char === ";" || $char === "." || $char === "," || $char === "<" || $char === "(" || $char === "[") {
                break;
            } else if ($char === ">") {
                $j++;
                break;
            }
            $j--;
        }
        $string = substr($string, 0, $j);
        foreach($tagsStack as $tag){
            $tag = strtolower($tag);
            if($tag !== "img" && $tag !== "br"){
                $string .= "</$tag>";
            }
        }
        $string .= "...";
    }

    if ($patternsReplace) {
        foreach ($patternsReplace as $value) {
            if (isset($value['pattern']) && isset($value["replace"])) {
                $string = preg_replace($value["pattern"], $value["replace"], $string);
            }
        }
    }
    return $string;
}

try this function

// trim the string function
function trim_word($text, $length, $startPoint=0, $allowedTags=""){
    $text = html_entity_decode(htmlspecialchars_decode($text));
    $text = strip_tags($text, $allowedTags);
    return $text = substr($text, $startPoint, $length);
}

and

echo trim_word("<h2 class='zzzz'>abcasdsdasasdas</h2>","6");
易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!