We receive some files that have been concatenated by another party. In the middle of these files are some UTF-8 BOM
(byte order mark) sequences.
Is there a way we can detect these 3 bytes (0xEF 0xBB 0xBF) and remove them? I've seen plenty of examples of how to remove the BOM
from the -start- of a file ... but not from the middle.
Assuming that your file is small enough to hold in memory, and that you have an Enumerable.Replace
extension method for replacing subsequences (LINQ does not ship one; an implementation is given below), then you could use:
// The three bytes that encode the UTF-8 byte order mark.
byte[] utf8Bom = { 0xEF, 0xBB, 0xBF };

// Load the whole file into memory; this approach is only suitable for files that fit in RAM.
byte[] originalContent = File.ReadAllBytes(filePath);

// Replace each BOM sequence with nothing, wherever it appears in the file.
byte[] cleanedContent = originalContent
    .Replace(utf8Bom, Enumerable.Empty<byte>())
    .ToArray();

// Overwrite the file in place with the cleaned bytes.
File.WriteAllBytes(filePath, cleanedContent);
Here is a simple (inefficient) implementation of the Replace
extension method:
/// <summary>
/// Replaces occurrences of the <paramref name="match"/> subsequence in <paramref name="source"/> with
/// <paramref name="replacement"/>, comparing elements with the default equality comparer for
/// <typeparamref name="TSource"/>.
/// </summary>
/// <typeparam name="TSource">The element type of the sequences.</typeparam>
/// <param name="source">The sequence to search.</param>
/// <param name="match">The subsequence to look for.</param>
/// <param name="replacement">The elements substituted for each occurrence of <paramref name="match"/>.</param>
/// <returns>The result of the comparer-taking overload called with <c>EqualityComparer&lt;TSource&gt;.Default</c>.</returns>
public static IEnumerable<TSource> Replace<TSource>(
    this IEnumerable<TSource> source,
    IEnumerable<TSource> match,
    IEnumerable<TSource> replacement)
{
    IEqualityComparer<TSource> defaultComparer = EqualityComparer<TSource>.Default;
    return source.Replace(match, replacement, defaultComparer);
}
/// <summary>
/// Replaces every non-overlapping occurrence of the <paramref name="match"/> subsequence in
/// <paramref name="source"/> with <paramref name="replacement"/>, comparing elements with
/// <paramref name="comparer"/>.
/// </summary>
/// <remarks>
/// Occurrences are located in a single left-to-right pass. After a match the scan resumes immediately past
/// the matched elements, so overlapping candidates are never replaced twice (the same convention as
/// <c>string.Replace</c>). Each input sequence is enumerated at most once.
/// </remarks>
/// <typeparam name="TSource">The element type of the sequences.</typeparam>
/// <param name="source">The sequence to search.</param>
/// <param name="match">The subsequence to look for.</param>
/// <param name="replacement">The elements substituted for each occurrence of <paramref name="match"/>.</param>
/// <param name="comparer">
/// The comparer used to test elements for equality; <c>null</c> selects
/// <c>EqualityComparer&lt;TSource&gt;.Default</c>.
/// </param>
/// <returns>
/// The elements of <paramref name="source"/> unchanged when <paramref name="match"/> is empty or longer than
/// <paramref name="source"/>; otherwise a new list containing the elements with the replacements applied.
/// </returns>
/// <exception cref="System.ArgumentNullException">
/// <paramref name="source"/>, <paramref name="match"/> or <paramref name="replacement"/> is <c>null</c>.
/// </exception>
public static IEnumerable<TSource> Replace<TSource>(
    this IEnumerable<TSource> source,
    IEnumerable<TSource> match,
    IEnumerable<TSource> replacement,
    IEqualityComparer<TSource> comparer)
{
    // Materialize each input once so lazy sequences are not re-enumerated and elements can be indexed.
    // Arrays and lists are used as-is; ToList() throws ArgumentNullException for a null argument.
    IList<TSource> items = source as IList<TSource> ?? source.ToList();
    IList<TSource> pattern = match as IList<TSource> ?? match.ToList();
    IList<TSource> substitute = replacement as IList<TSource> ?? replacement.ToList();

    if (comparer == null)
    {
        comparer = EqualityComparer<TSource>.Default;
    }

    int sLength = items.Count;
    int mLength = pattern.Count;
    if (mLength == 0 || sLength < mLength)
    {
        // Nothing can match: hand back the input elements untouched.
        return items;
    }

    var result = new List<TSource>(sLength);
    int lastStart = sLength - mLength; // last index at which a full match still fits
    int position = 0;
    while (position < sLength)
    {
        bool isMatch = position <= lastStart;
        for (int offset = 0; isMatch && offset < mLength; offset++)
        {
            isMatch = comparer.Equals(items[position + offset], pattern[offset]);
        }

        if (isMatch)
        {
            result.AddRange(substitute);
            // Jump past the whole match so an overlapping occurrence is not replaced a second time.
            position += mLength;
        }
        else
        {
            result.Add(items[position]);
            position++;
        }
    }

    return result;
}
来源:https://stackoverflow.com/questions/25228263/how-can-i-remove-any-utf-8-bom-that-exists-within-some-text-not-at-the-start