I have various HTML strings to cut to 100 characters (of the stripped content, not the original) without stripping tags and without breaking HTML.
Original H
Not amazing, but works.
function html_cut($text, $max_length)
{
$tags = array();
$result = "";
$is_open = false;
$grab_open = false;
$is_close = false;
$in_double_quotes = false;
$in_single_quotes = false;
$tag = "";
$i = 0;
$stripped = 0;
$stripped_text = strip_tags($text);
while ($i < strlen($text) && $stripped < strlen($stripped_text) && $stripped < $max_length)
{
$symbol = $text{$i};
$result .= $symbol;
switch ($symbol)
{
case '<':
$is_open = true;
$grab_open = true;
break;
case '"':
if ($in_double_quotes)
$in_double_quotes = false;
else
$in_double_quotes = true;
break;
case "'":
if ($in_single_quotes)
$in_single_quotes = false;
else
$in_single_quotes = true;
break;
case '/':
if ($is_open && !$in_double_quotes && !$in_single_quotes)
{
$is_close = true;
$is_open = false;
$grab_open = false;
}
break;
case ' ':
if ($is_open)
$grab_open = false;
else
$stripped++;
break;
case '>':
if ($is_open)
{
$is_open = false;
$grab_open = false;
array_push($tags, $tag);
$tag = "";
}
else if ($is_close)
{
$is_close = false;
array_pop($tags);
$tag = "";
}
break;
default:
if ($grab_open || $is_close)
$tag .= $symbol;
if (!$is_open && !$is_close)
$stripped++;
}
$i++;
}
while ($tags)
$result .= "</".array_pop($tags).">";
return $result;
}
Usage example:
$content = html_cut($content, 100);
Use a HTML parser and stop after 100 characters of text.
try this function
// trim the string function
function trim_word($text, $length, $startPoint=0, $allowedTags=""){
$text = html_entity_decode(htmlspecialchars_decode($text));
$text = strip_tags($text, $allowedTags);
return $text = substr($text, $startPoint, $length);
}
and
echo trim_word("<h2 class='zzzz'>abcasdsdasasdas</h2>","6");
I'm not claiming to have invented this, but there is a very complete Text::truncate() method in CakePHP which does what you want:
function truncate($text, $length = 100, $ending = '...', $exact = true, $considerHtml = false) {
if (is_array($ending)) {
extract($ending);
}
if ($considerHtml) {
if (mb_strlen(preg_replace('/<.*?>/', '', $text)) <= $length) {
return $text;
}
$totalLength = mb_strlen($ending);
$openTags = array();
$truncate = '';
preg_match_all('/(<\/?([\w+]+)[^>]*>)?([^<>]*)/', $text, $tags, PREG_SET_ORDER);
foreach ($tags as $tag) {
if (!preg_match('/img|br|input|hr|area|base|basefont|col|frame|isindex|link|meta|param/s', $tag[2])) {
if (preg_match('/<[\w]+[^>]*>/s', $tag[0])) {
array_unshift($openTags, $tag[2]);
} else if (preg_match('/<\/([\w]+)[^>]*>/s', $tag[0], $closeTag)) {
$pos = array_search($closeTag[1], $openTags);
if ($pos !== false) {
array_splice($openTags, $pos, 1);
}
}
}
$truncate .= $tag[1];
$contentLength = mb_strlen(preg_replace('/&[0-9a-z]{2,8};|&#[0-9]{1,7};|&#x[0-9a-f]{1,6};/i', ' ', $tag[3]));
if ($contentLength + $totalLength > $length) {
$left = $length - $totalLength;
$entitiesLength = 0;
if (preg_match_all('/&[0-9a-z]{2,8};|&#[0-9]{1,7};|&#x[0-9a-f]{1,6};/i', $tag[3], $entities, PREG_OFFSET_CAPTURE)) {
foreach ($entities[0] as $entity) {
if ($entity[1] + 1 - $entitiesLength <= $left) {
$left--;
$entitiesLength += mb_strlen($entity[0]);
} else {
break;
}
}
}
$truncate .= mb_substr($tag[3], 0 , $left + $entitiesLength);
break;
} else {
$truncate .= $tag[3];
$totalLength += $contentLength;
}
if ($totalLength >= $length) {
break;
}
}
} else {
if (mb_strlen($text) <= $length) {
return $text;
} else {
$truncate = mb_substr($text, 0, $length - strlen($ending));
}
}
if (!$exact) {
$spacepos = mb_strrpos($truncate, ' ');
if (isset($spacepos)) {
if ($considerHtml) {
$bits = mb_substr($truncate, $spacepos);
preg_match_all('/<\/([a-z]+)>/', $bits, $droppedTags, PREG_SET_ORDER);
if (!empty($droppedTags)) {
foreach ($droppedTags as $closingTag) {
if (!in_array($closingTag[1], $openTags)) {
array_unshift($openTags, $closingTag[1]);
}
}
}
}
$truncate = mb_substr($truncate, 0, $spacepos);
}
}
$truncate .= $ending;
if ($considerHtml) {
foreach ($openTags as $tag) {
$truncate .= '</'.$tag.'>';
}
}
return $truncate;
}
I made another function to do it, it supports UTF-8:
/**
* Limit string without break html tags.
* Supports UTF8
*
* @param string $value
* @param int $limit Default 100
*/
function str_limit_html($value, $limit = 100)
{
if (mb_strwidth($value, 'UTF-8') <= $limit) {
return $value;
}
// Strip text with HTML tags, sum html len tags too.
// Is there another way to do it?
do {
$len = mb_strwidth($value, 'UTF-8');
$len_stripped = mb_strwidth(strip_tags($value), 'UTF-8');
$len_tags = $len - $len_stripped;
$value = mb_strimwidth($value, 0, $limit + $len_tags, '', 'UTF-8');
} while ($len_stripped > $limit);
// Load as HTML ignoring errors
$dom = new DOMDocument();
@$dom->loadHTML('<?xml encoding="utf-8" ?>'.$value, LIBXML_HTML_NODEFDTD);
// Fix the html errors
$value = $dom->saveHtml($dom->getElementsByTagName('body')->item(0));
// Remove body tag
$value = mb_strimwidth($value, 6, mb_strwidth($value, 'UTF-8') - 13, '', 'UTF-8'); // <body> and </body>
// Remove empty tags
return preg_replace('/<(\w+)\b(?:\s+[\w\-.:]+(?:\s*=\s*(?:"[^"]*"|"[^"]*"|[\w\-.:]+))?)*\s*\/?>\s*<\/\1\s*>/', '', $value);
}
SEE DEMO.
I recommend use html_entity_decode
at the start of function, so it preserves the UTF-8 characters:
$value = html_entity_decode($value);
Here is a function I'm using in one of my projects. It's based on DOMDocument, works with HTML5 and is about 2x faster than other solutions I've tried (at least on my machine, 0.22 ms vs 0.43 ms using html_cut($text, $max_length)
from the top answer on a 500 text-node-characters string with a limit of 400).
function cut_html ($html, $limit) {
$dom = new DOMDocument();
$dom->loadHTML(mb_convert_encoding("<div>{$html}</div>", "HTML-ENTITIES", "UTF-8"), LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD);
cut_html_recursive($dom->documentElement, $limit);
return substr($dom->saveHTML($dom->documentElement), 5, -6);
}
function cut_html_recursive ($element, $limit) {
if($limit > 0) {
if($element->nodeType == 3) {
$limit -= strlen($element->nodeValue);
if($limit < 0) {
$element->nodeValue = substr($element->nodeValue, 0, strlen($element->nodeValue) + $limit);
}
}
else {
for($i = 0; $i < $element->childNodes->length; $i++) {
if($limit > 0) {
$limit = cut_html_recursive($element->childNodes->item($i), $limit);
}
else {
$element->removeChild($element->childNodes->item($i));
$i--;
}
}
}
}
return $limit;
}