remove HTML tags from cell strings : excel Formula

后端 未结 3 1733
感动是毒 2020-12-16 21:28

I have data containing HTML tags in an Excel sheet, like below:

This is test data
Go on this is next Cell Very goood ...
  • 2020-12-16 21:55

    Apply Replace All with <*> pattern:

    replace tags pattern

    To open this go to Ribbon Home > Find & Select > Replace... or simply press CTRL+H.

    Extra spaces may be further removed using TRIM function. Good luck!

    0 讨论(0)
  • 2020-12-16 22:02

    Open the VBA editor in Excel (Alt+F11) and click on the project name (the spreadsheet name) in the project explorer on the right. Choose Insert -> New Module and paste the user-defined function below into the module window. Save the workbook as .XLSM, which allows macros.

    Type the formula '=StripHTML(A2)' into a cell, assuming your data is in cell A2. You can also download a working example here:

    Function StripHTML(cell As Range) As String
        ' Worksheet UDF: returns the text of `cell` with HTML markup removed.
        '
        ' Processing order matters:
        '   1. normalise line endings,
        '   2. turn </p>, <br> and <li> into line breaks / dashes,
        '   3. strip every remaining tag,
        '   4. decode character entities.
        ' Entities are decoded last so that escaped text such as "&lt;b&gt;"
        ' survives as the literal "<b>" instead of being mistaken for a tag.
        '
        ' Parameters:
        '   cell - the cell whose text is cleaned.
        ' Returns:
        '   The cleaned plain text.
        Dim RegEx As Object
        Dim sInput As String

        Set RegEx = CreateObject("vbscript.regexp")
        RegEx.Global = True
        RegEx.IgnoreCase = True   ' tags are matched regardless of case
        RegEx.MultiLine = True

        ' NOTE(review): .Text is the formatted text shown in the cell, not the
        ' underlying value - confirm that is what callers want.
        sInput = cell.Text

        ' VBA's Replace does not interpret "\x.." escapes, so the real control
        ' characters are handled explicitly. The literal forms are still
        ' replaced as well, in case the data contains them as plain text.
        sInput = Replace(sInput, vbCrLf, vbLf)
        sInput = Replace(sInput, vbNullChar, vbLf)
        sInput = Replace(sInput, "\x0D\x0A", vbLf)
        sInput = Replace(sInput, "\x00", vbLf)

        ' End of paragraph -> blank line.
        RegEx.Pattern = "</p\s*>"
        sInput = RegEx.Replace(sInput, vbLf & vbLf)

        ' <br>, <br/>, <br /> -> line break.
        RegEx.Pattern = "<br\s*/?>"
        sInput = RegEx.Replace(sInput, vbLf)

        ' List item -> dash (the optional attribute group avoids matching <link>).
        RegEx.Pattern = "<li(\s[^>]*)?>"
        sInput = RegEx.Replace(sInput, "-")

        ' Remove all the remaining HTML tags.
        RegEx.Pattern = "<[^>]+>"
        sInput = RegEx.Replace(sInput, "")

        StripHTML = DecodeHtmlEntities(sInput)
        Set RegEx = Nothing
    End Function

    Private Function DecodeHtmlEntities(ByVal sText As String) As String
        ' Decodes named entities from the table below and decimal references
        ' (&#NNN;) in a single pass, so "&amp;lt;" becomes "&lt;" and not "<".
        ' Unknown entities are left untouched. Characters are produced with
        ' ChrW so the code does not depend on the editor's code page.
        Dim sTable As String
        Dim sName As String
        Dim sChar As String
        Dim oHits As Object
        Dim i As Long
        Dim lPos As Long
        Dim lStart As Long
        Dim lCode As Long

        If InStr(sText, "&") = 0 Then
            DecodeHtmlEntities = sText
            Exit Function
        End If

        ' " name=codepoint " list; must start and end with a space.
        ' Curly double quotes and the left single quote are normalised to
        ' straight quotes; &nbsp; becomes an ordinary space.
        sTable = " amp=38 lt=60 gt=62 quot=34 ldquo=34 rdquo=34 apos=39 lsquo=39 rsquo=8217 "
        sTable = sTable & "ndash=8211 mdash=8212 iexcl=161 iquest=191 laquo=171 raquo=187 nbsp=32 "
        sTable = sTable & "cent=162 copy=169 divide=247 micro=181 middot=183 para=182 plusmn=177 "
        sTable = sTable & "euro=8364 pound=163 reg=174 sect=167 trade=8482 yen=165 acute=180 "
        sTable = sTable & "aacute=225 Aacute=193 agrave=224 Agrave=192 acirc=226 Acirc=194 "
        sTable = sTable & "aring=229 Aring=197 atilde=227 Atilde=195 auml=228 Auml=196 "
        sTable = sTable & "aelig=230 AElig=198 ccedil=231 Ccedil=199 "
        sTable = sTable & "eacute=233 Eacute=201 egrave=232 Egrave=200 ecirc=234 Ecirc=202 euml=235 Euml=203 "
        sTable = sTable & "iacute=237 Iacute=205 igrave=236 Igrave=204 icirc=238 Icirc=206 iuml=239 Iuml=207 "
        sTable = sTable & "ntilde=241 Ntilde=209 "
        sTable = sTable & "oacute=243 Oacute=211 ograve=242 Ograve=210 ocirc=244 Ocirc=212 "
        sTable = sTable & "oslash=248 Oslash=216 otilde=245 Otilde=213 ouml=246 Ouml=214 szlig=223 "
        sTable = sTable & "uacute=250 Uacute=218 ugrave=249 Ugrave=217 ucirc=251 Ucirc=219 uuml=252 Uuml=220 "
        sTable = sTable & "yuml=255 "

        With CreateObject("vbscript.regexp")
            .Global = True
            .IgnoreCase = False
            .Pattern = "&(#\d{1,5}|[A-Za-z]+);"
            Set oHits = .Execute(sText)
        End With

        ' Walk the matches back to front so earlier match offsets stay valid
        ' while the string is being spliced.
        For i = oHits.Count - 1 To 0 Step -1
            sName = oHits(i).SubMatches(0)
            sChar = ""
            If Left$(sName, 1) = "#" Then
                lCode = CLng(Mid$(sName, 2))
                ' Skip NUL, out-of-range values and lone surrogates.
                If lCode >= 1 And lCode <= 65535 And (lCode < 55296 Or lCode > 57343) Then
                    sChar = ChrW(lCode)
                End If
            Else
                ' Binary compare: entity names are case-sensitive (aacute vs Aacute).
                lPos = InStr(1, sTable, " " & sName & "=", vbBinaryCompare)
                If lPos > 0 Then
                    lStart = lPos + Len(sName) + 2
                    lCode = CLng(Mid$(sTable, lStart, InStr(lStart, sTable, " ") - lStart))
                    sChar = ChrW(lCode)
                End If
            End If
            If Len(sChar) > 0 Then
                ' FirstIndex is zero-based.
                sText = Left$(sText, oHits(i).FirstIndex) & sChar & _
                        Mid$(sText, oHits(i).FirstIndex + oHits(i).Length + 1)
            End If
        Next i

        DecodeHtmlEntities = sText
    End Function
    0 讨论(0)
  • 2020-12-16 22:06

    Since the macro above didn't work for me I fixed it myself. It's my first script, if you guys can improve it, make it faster, add more then you're more than welcome!

    OK guys, I've had no previous programming experience (except for some very basic Java 6 years ago), but with some help and lots of guessing (hours, actually) I managed to make this script. It works like a charm to remove most <tags> and numeric entity text such as &#8217;, but it does not replace <BR> with a line break. You can do that by pressing CTRL+H, entering <br> in the "Find" box, then clicking in the "Replace" box, holding ALT down and typing 0010 on your numeric keypad (a small dot should be blinking in the replace box), and finally clicking "Replace All".

    Paste the code below into a user module (alt +f11, right click Sheet1->insert->Module->paste code)

    And make a button by going File->Options->Customize Ribbon-> check the Developer checkbox. Then go to developer tab->Insert->Button-> then place the button and right click->assign macro-> Choose RemoveTags.

    Sub RemoveTags()
        ' Macro: strips HTML tags from every cell in the current selection and
        ' replaces a fixed list of numeric entities with a single space.
        ' The selected cells are switched to the Text number format and their
        ' contents are overwritten in place.
        Dim rTarget As Range
        Dim r As Range
        Dim vEntities As Variant
        Dim vEntity As Variant
        Dim sText As String

        ' Nothing to do when a chart, shape, etc. is selected instead of cells.
        If TypeName(Selection) <> "Range" Then Exit Sub

        ' Limit the work to cells that can hold data, so selecting whole
        ' columns/rows does not visit every row of the sheet. This is computed
        ' before the format change below touches the selection.
        Set rTarget = Intersect(Selection, Selection.Worksheet.UsedRange)

        Selection.NumberFormat = "@"  'set cells to text numberformat
        If rTarget Is Nothing Then Exit Sub

        ' NOTE(review): the last entry also swallows the letter "s" that
        ' follows &#146; - kept as-is for compatibility; confirm it is intended.
        vEntities = Array("&#8217;", "&#8211;", "&#8216;", "&#8232;", "&#8233;", "&#146;s")

        With CreateObject("vbscript.regexp")
            ' [^>]* (unlike .*?) also matches tags that wrap across a line
            ' break inside a cell.
            .Pattern = "<[^>]*>"
            .Global = True
            For Each r In rTarget.Cells
                ' Error values (#N/A, #VALUE!, ...) cannot be treated as text; skip them.
                If Not IsError(r.Value) Then
                    sText = .Replace(CStr(r.Value), "")
                    For Each vEntity In vEntities
                        sText = Replace(sText, vEntity, " ")
                    Next vEntity
                    r.Value = sText
                End If
            Next r
        End With
    End Sub
    ' Empty Click event handler: it contains no statements, so running it does nothing.
    ' NOTE(review): presumably a leftover stub for an ActiveX button named
    ' CommandButton1 - confirm whether it is needed at all.
    Private Sub CommandButton1_Click()
    End Sub
    0 讨论(0)