I have data with HTML tags in an Excel sheet, like below:
This is test data
Nice
Go on this is next Cell
Very goood ...
Apply Replace All
with the <*> pattern in the "Find what" box (leave "Replace with" empty):
To open this go to Ribbon Home > Find & Select > Replace...
or simply press CTRL+H.
Extra spaces may be further removed using TRIM
function. Good luck!
Open the VBA editor in Excel (Alt+F11) and click on the project name (the spreadsheet name) in the project explorer on the right. Choose Insert -> New Module. Paste the user-defined function below into the module window. Save the workbook as .XLSM, which allows macros.
Type the formula '=StripHTML(A2)' into a cell, assuming your data is in cell A2. You can also download a working example here:
http://jfrancisconsulting.com/how-to-strip-html-tags-in-excel/
' Returns the content of a cell as plain text with HTML markup removed.
'
' Processing order (the order matters):
'   1. Normalise CR+LF and NUL characters to a line feed.
'   2. Turn </p>, <br> and <li> tags into line breaks / dashes (any letter case,
'      optional whitespace, optional self-closing slash).
'   3. Remove every remaining tag.
'   4. Decode HTML entities. This is done AFTER tag removal so that escaped text
'      such as "&lt;b&gt;" ends up as the literal text "<b>" instead of being
'      mistaken for a tag and deleted.
'
' Parameters:
'   cell - the range to read; only its first (top-left) cell is used.
' Returns:
'   The cleaned text, or "" when the cell is empty.
Function StripHTML(cell As Range) As String
    Dim rx As Object
    Dim firstCell As Range
    Dim rawValue As Variant
    Dim sInput As String

    ' Range.Text returns the *displayed* text, which can be "####" when the cell
    ' cannot render its content, and is Null for a multi-cell range whose cells
    ' differ. Read the stored string when there is one; fall back to the
    ' formatted text for numbers, dates and error values.
    Set firstCell = cell.Cells(1, 1)
    rawValue = firstCell.Value2
    If VarType(rawValue) = vbString Then
        sInput = rawValue
    Else
        sInput = firstCell.Text
    End If

    If Len(sInput) = 0 Then
        StripHTML = ""
        Exit Function
    End If

    ' VBA string literals have no "\x.." escapes, so use the real characters.
    sInput = Replace(sInput, vbCrLf, Chr(10))
    sInput = Replace(sInput, vbNullChar, Chr(10))

    Set rx = CreateObject("vbscript.regexp")
    rx.Global = True
    rx.IgnoreCase = True
    rx.MultiLine = True

    'replace end of paragraphs and HTML breaks with line breaks
    rx.Pattern = "</p\s*>"
    sInput = rx.Replace(sInput, Chr(10) & Chr(10))
    rx.Pattern = "<br\s*/?>"
    sInput = rx.Replace(sInput, Chr(10))

    'replace bullets with dashes ("(\s[^>]*)?" keeps <link> etc. from matching)
    rx.Pattern = "<li(\s[^>]*)?>"
    sInput = rx.Replace(sInput, "-")

    'remove all the remaining HTML tags
    rx.Pattern = "<[^>]+>"
    sInput = rx.Replace(sInput, "")

    'add back all of the special characters
    StripHTML = DecodeHtmlEntities(sInput)

    Set rx = Nothing
End Function

' Decodes the named HTML entities listed in ENTITY_TABLE, decimal numeric
' references (&#NNN; up to 65535) and &amp; in the given text.
' Hexadecimal references (&#xNN;) and entities not in the table are left as is.
Private Function DecodeHtmlEntities(ByVal s As String) As String
    ' name=code point pairs. Code points are used instead of literal characters
    ' so the module does not depend on the code page the file was saved in.
    ' Entity names are case-sensitive (aacute vs Aacute).
    ' NOTE(review): lsquo maps to a straight apostrophe and nbsp to a plain
    ' space, as in the earlier version of this function; apos and acute were
    ' added because the earlier search strings for "'" and the acute accent
    ' were lost - confirm these are the entities that were meant.
    Const ENTITY_TABLE As String = _
        "ndash=8211 mdash=8212 iexcl=161 iquest=191 quot=34 ldquo=8220 rdquo=8221 " & _
        "apos=39 lsquo=39 rsquo=8217 laquo=171 raquo=187 nbsp=32 cent=162 copy=169 " & _
        "divide=247 gt=62 lt=60 micro=181 middot=183 para=182 plusmn=177 euro=8364 " & _
        "pound=163 reg=174 sect=167 trade=8482 yen=165 " & _
        "aacute=225 Aacute=193 agrave=224 Agrave=192 acirc=226 Acirc=194 aring=229 Aring=197 " & _
        "atilde=227 Atilde=195 auml=228 Auml=196 aelig=230 AElig=198 ccedil=231 Ccedil=199 " & _
        "eacute=233 Eacute=201 egrave=232 Egrave=200 ecirc=234 Ecirc=202 euml=235 Euml=203 " & _
        "iacute=237 Iacute=205 igrave=236 Igrave=204 icirc=238 Icirc=206 iuml=239 Iuml=207 " & _
        "ntilde=241 Ntilde=209 oacute=243 Oacute=211 ograve=242 Ograve=210 ocirc=244 Ocirc=212 " & _
        "oslash=248 Oslash=216 otilde=245 Otilde=213 ouml=246 Ouml=214 szlig=223 " & _
        "uacute=250 Uacute=218 ugrave=249 Ugrave=217 ucirc=251 Ucirc=219 uuml=252 Uuml=220 " & _
        "yuml=255 acute=180"

    Dim pairs() As String
    Dim parts() As String
    Dim i As Long
    Dim rx As Object
    Dim hits As Object
    Dim hit As Object
    Dim decoded As String
    Dim nextPos As Long
    Dim codePoint As Long

    ' Nothing to decode without an ampersand.
    If InStr(1, s, "&", vbBinaryCompare) = 0 Then
        DecodeHtmlEntities = s
        Exit Function
    End If

    ' Named entities. None of these produces "&", so sequential replacement
    ' cannot create a new entity out of already-decoded text.
    pairs = Split(ENTITY_TABLE, " ")
    For i = LBound(pairs) To UBound(pairs)
        parts = Split(pairs(i), "=")
        If UBound(parts) = 1 Then
            s = Replace(s, "&" & parts(0) & ";", ChrW(CLng(parts(1))), _
                        Compare:=vbBinaryCompare)
        End If
    Next i

    ' &amp; and numeric references are decoded last and in a single left-to-right
    ' pass, so "&amp;lt;" yields the text "&lt;" rather than "<".
    Set rx = CreateObject("vbscript.regexp")
    rx.Global = True
    rx.IgnoreCase = False
    rx.Pattern = "&(?:amp|#(\d{1,5}));"
    Set hits = rx.Execute(s)

    nextPos = 1
    For Each hit In hits
        ' FirstIndex is zero-based; Mid$ is one-based.
        decoded = decoded & Mid$(s, nextPos, hit.FirstIndex + 1 - nextPos)
        If hit.Value = "&amp;" Then
            decoded = decoded & "&"
        Else
            codePoint = CLng(hit.SubMatches(0))
            If codePoint > 0 And codePoint <= 65535 Then
                decoded = decoded & ChrW(codePoint)
            Else
                ' Out of ChrW's range: keep the reference untouched.
                decoded = decoded & hit.Value
            End If
        End If
        nextPos = hit.FirstIndex + 1 + hit.Length
    Next hit
    decoded = decoded & Mid$(s, nextPos)

    DecodeHtmlEntities = decoded
    Set rx = Nothing
End Function
Since the macro above didn't work for me I fixed it myself. It's my first script, if you guys can improve it, make it faster, add more then you're more than welcome!
OK guys, I've had no previous programming experience (except for some very basic Java 6 years ago), but with some help and lots of guessing (hours, actually) I managed to make this script. It works like a charm to remove most tags and "&#8...;"-style entity text, but it does not replace <BR> with a line break (you can do this by hitting CTRL + H, then "Find: <br>" and "Replace: " (now hold ALT down and type 0010 on your NUMPAD; a small dot should be blinking in the Replace box), then hit "Replace All").
Paste the code below into a user module (alt +f11, right click Sheet1->insert->Module->paste code)
And make a button by going File->Options->Customize Ribbon-> check the Developer checkbox. Then go to developer tab->Insert->Button-> then place the button and right click->assign macro-> Choose RemoveTags.
' Removes HTML tags from every text cell in the current selection, in place,
' and replaces a few typographic characters and line breaks with a space.
'
' The selection is first formatted as Text ("@"). Only cells that hold a string
' are rewritten; numbers, dates, blanks and error values are left alone.
' Cells whose text would not change are not written back, so formulas that
' return tag-free text are not overwritten.
Sub RemoveTags()
    Dim target As Range
    Dim r As Range
    Dim unwanted As Variant
    Dim original As String
    Dim cleaned As String
    Dim i As Long

    ' Selection can be a chart, shape, etc.; only ranges can be processed.
    If TypeName(Selection) <> "Range" Then Exit Sub

    Selection.NumberFormat = "@" 'set cells to text numberformat

    ' Limit the walk to cells that can actually hold data, so selecting whole
    ' rows or columns does not iterate over a million empty cells.
    Set target = Intersect(Selection, Selection.Worksheet.UsedRange)
    If target Is Nothing Then Exit Sub

    ' Text that is replaced by a single space: right/left single quotation
    ' marks and the en dash (built with ChrW so the module does not depend on
    ' the code page it was saved in), and line breaks.
    ' NOTE(review): the search strings were garbled in the earlier version of
    ' this macro; they presumably were the numeric entities rather than the
    ' characters themselves, so both forms are handled here - confirm.
    ' A former separate replacement of the right quotation mark followed by "s"
    ' could never match because the mark alone was replaced first; it is dropped.
    unwanted = Array(ChrW(8217), ChrW(8211), ChrW(8216), _
                     "&#8217;", "&#8211;", "&#8216;", _
                     vbCrLf, vbCr, vbLf)

    With CreateObject("vbscript.regexp")
        ' [^>] also matches line breaks, so tags spanning lines are removed.
        .Pattern = "<[^>]*>"
        .Global = True

        For Each r In target.Cells
            If VarType(r.Value2) = vbString Then
                original = r.Value2
                cleaned = .Replace(original, "")
                For i = LBound(unwanted) To UBound(unwanted)
                    cleaned = Replace(cleaned, unwanted(i), " ", _
                                      Compare:=vbBinaryCompare)
                Next i
                If StrComp(cleaned, original, vbBinaryCompare) <> 0 Then
                    r.Value2 = cleaned
                End If
            End If
        Next r
    End With
End Sub
' Click handler for the control named CommandButton1. The body is empty, so
' clicking the control currently does nothing.
' NOTE(review): presumably a leftover stub created when a button was added -
' confirm whether it is meant to call RemoveTags or can be deleted.
Private Sub CommandButton1_Click()
End Sub