I\'m trying to read CSV files using Java. Some of the files may have a byte order mark in the beginning, but not all. When present, the byte order gets read along with the r
NotePad++ is a good tool to convert UTF-8 encoding to UTF-8(BOM) encoding.
https://notepad-plus-plus.org/downloads/
UTF8BOMTester.java
public class UTF8BOMTester {
public static void main(String[] args) throws FileNotFoundException, IOException {
// TODO Auto-generated method stub
File file = new File("test.txt");
boolean same = UTF8BOMInputStream.isSameEncodingType(file);
System.out.println(same);
if (same) {
UTF8BOMInputStream is = new UTF8BOMInputStream(file);
BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"));
System.out.println(br.readLine());
}
}
static void bytesPrint(byte[] b) {
for (byte a : b)
System.out.printf("%x ", a);
}}
UTF8BOMInputStream.java
public class UTF8BOMInputStream extends InputStream {
byte[] SYMBLE_BOM = { (byte) 0xEF, (byte) 0xBB, (byte) 0xBF };
FileInputStream fis;
final boolean isSameEncodingType;
public UTF8BOMInputStream(File file) throws IOException {
FileInputStream fis=new FileInputStream(file);
byte[] symble=new byte[3];
fis.read(symble);
bytesPrint(symble);
isSameEncodingType=isSameEncodingType(symble);
if(isSameEncodingType)
this.fis=fis;
else
this.fis=null;
}
@Override
public int read() throws IOException {
return fis.read();
}
void bytesPrint(byte[] b) {
for (byte a : b)
System.out.printf("%x ", a);
}
boolean bytesCompare(byte[] a, byte[] b) {
if (a.length != b.length)
return false;
for (int i = 0; i < a.length; i++) {
if (a[i] != b[i])
return false;
}
return true;
}
boolean isSameEncodingType(byte[] symble) {
return bytesCompare(symble,SYMBLE_BOM);
}
public static boolean isSameEncodingType(File file) throws IOException {
return (new UTF8BOMInputStream(file)).isSameEncodingType;
}