Byte order mark screws up file reading in Java

后端 未结 9 2511
说谎
说谎 2020-11-22 02:55

I'm trying to read CSV files using Java. Some of the files may have a byte order mark at the beginning, but not all. When present, the byte order mark gets read along with the r…

9条回答
  •  长发绾君心
    2020-11-22 03:32

    EDIT: I've made a proper release on GitHub: https://github.com/gpakosz/UnicodeBOMInputStream


    Here is a class I coded a while ago; I just edited the package name before pasting. Nothing special: it is quite similar to the solutions posted in Sun's bug database. Incorporate it in your code and you're fine.

    /* ____________________________________________________________________________
     * 
     * File:    UnicodeBOMInputStream.java
     * Author:  Gregory Pakosz.
     * Date:    02 - November - 2005    
     * ____________________________________________________________________________
     */
    package com.stackoverflow.answer;
    
    import java.io.IOException;
    import java.io.InputStream;
    import java.io.PushbackInputStream;
    
    /**
     * The {@code UnicodeBOMInputStream} class wraps any {@code InputStream} and
     * detects the presence of any Unicode BOM (Byte Order Mark) at its beginning,
     * as defined by RFC 3629 - UTF-8, a transformation format of ISO 10646.
     *
     * <p>The Unicode FAQ defines 5 types of BOMs:</p>
     * <ul>
     * <li>{@code 00 00 FE FF} = UTF-32, big-endian</li>
     * <li>{@code FF FE 00 00} = UTF-32, little-endian</li>
     * <li>{@code FE FF      } = UTF-16, big-endian</li>
     * <li>{@code FF FE      } = UTF-16, little-endian</li>
     * <li>{@code EF BB BF   } = UTF-8</li>
     * </ul>
     *
     * <p>Use the {@link #getBOM()} method to know whether a BOM has been detected
     * or not.</p>
     *
     * <p>Use the {@link #skipBOM()} method to remove the detected BOM from the
     * wrapped {@code InputStream} object.</p>
     */
    public class UnicodeBOMInputStream extends InputStream
    {
      /**
       * Type safe enumeration class that describes the different types of Unicode
       * BOMs.
       */
      public static final class BOM
      {
        /** NONE. */
        public static final BOM NONE = new BOM(new byte[]{}, "NONE");

        /** UTF-8 BOM (EF BB BF). */
        public static final BOM UTF_8 = new BOM(
            new byte[]{(byte)0xEF, (byte)0xBB, (byte)0xBF},
            "UTF-8");

        /** UTF-16, little-endian (FF FE). */
        public static final BOM UTF_16_LE = new BOM(
            new byte[]{(byte)0xFF, (byte)0xFE},
            "UTF-16 little-endian");

        /** UTF-16, big-endian (FE FF). */
        public static final BOM UTF_16_BE = new BOM(
            new byte[]{(byte)0xFE, (byte)0xFF},
            "UTF-16 big-endian");

        /** UTF-32, little-endian (FF FE 00 00). */
        public static final BOM UTF_32_LE = new BOM(
            new byte[]{(byte)0xFF, (byte)0xFE, (byte)0x00, (byte)0x00},
            "UTF-32 little-endian");

        /** UTF-32, big-endian (00 00 FE FF). */
        public static final BOM UTF_32_BE = new BOM(
            new byte[]{(byte)0x00, (byte)0x00, (byte)0xFE, (byte)0xFF},
            "UTF-32 big-endian");

        /**
         * Returns a {@code String} representation of this {@code BOM} value.
         */
        @Override
        public final String toString()
        {
          return description;
        }

        /**
         * Returns the bytes corresponding to this {@code BOM} value.
         *
         * @return a fresh copy of the BOM bytes; modifying it does not affect
         *         this {@code BOM}.
         */
        public final byte[] getBytes()
        {
          // Defensive copy: the constants are shared and must stay immutable.
          return bytes.clone();
        }

        private BOM(final byte bom[], final String description)
        {
          assert(bom != null)               : "invalid BOM: null is not allowed";
          assert(description != null)       : "invalid description: null is not allowed";
          assert(description.length() != 0) : "invalid description: empty string is not allowed";

          this.bytes       = bom;
          this.description = description;
        }

        // Private so that outside code can only reach the bytes through the
        // defensive copy returned by getBytes().
        private final byte[] bytes;
        private final String description;

      } // BOM

      /** Length in bytes of the longest BOM (UTF-32). */
      private static final int MAX_BOM_LENGTH = 4;

      /**
       * Candidates in the order they must be tested: the UTF-32 little-endian
       * BOM starts with the UTF-16 little-endian one, so the longer signatures
       * have to be tried first.
       */
      private static final BOM[] DETECTION_ORDER = {
        BOM.UTF_32_LE, BOM.UTF_32_BE, BOM.UTF_8, BOM.UTF_16_LE, BOM.UTF_16_BE
      };

      /**
       * Constructs a new {@code UnicodeBOMInputStream} that wraps the specified
       * {@code InputStream}.
       *
       * <p>Up to four bytes are read from the stream to detect the BOM and are
       * then pushed back, so the wrapper still delivers the stream from its very
       * first byte unless {@link #skipBOM()} is called.</p>
       *
       * @param inputStream an {@code InputStream}.
       *
       * @throws NullPointerException when {@code inputStream} is {@code null}.
       * @throws IOException on reading from the specified {@code InputStream}
       *         when trying to detect the Unicode BOM.
       */
      public UnicodeBOMInputStream(final InputStream inputStream) throws NullPointerException,
                                                                         IOException
      {
        if (inputStream == null)
        {
          throw new NullPointerException("invalid input stream: null is not allowed");
        }

        in = new PushbackInputStream(inputStream, MAX_BOM_LENGTH);

        final byte[] head = new byte[MAX_BOM_LENGTH];
        final int read = readHead(in, head);

        this.bom = detect(head, read);

        if (read > 0)
        {
          in.unread(head, 0, read);
        }
      }

      /**
       * Fills {@code head} from {@code source}, stopping early only at end of
       * stream. A single bulk read may legally return fewer bytes than requested
       * even when more are coming, which would make BOM detection miss.
       *
       * @return the number of bytes actually stored in {@code head}.
       */
      private static int readHead(final InputStream source, final byte[] head) throws IOException
      {
        int total = 0;
        while (total < head.length)
        {
          final int count = source.read(head, total, head.length - total);
          if (count < 0)
          {
            break;
          }
          total += count;
        }
        return total;
      }

      /**
       * Returns the first BOM in {@link #DETECTION_ORDER} that is a prefix of the
       * first {@code length} bytes of {@code head}, or {@link BOM#NONE}.
       */
      private static BOM detect(final byte[] head, final int length)
      {
        for (int i = 0; i < DETECTION_ORDER.length; i++)
        {
          if (startsWith(head, length, DETECTION_ORDER[i].bytes))
          {
            return DETECTION_ORDER[i];
          }
        }
        return BOM.NONE;
      }

      /** Tells whether the first {@code length} bytes of {@code head} begin with {@code signature}. */
      private static boolean startsWith(final byte[] head, final int length, final byte[] signature)
      {
        if (length < signature.length)
        {
          return false;
        }
        for (int i = 0; i < signature.length; i++)
        {
          if (head[i] != signature[i])
          {
            return false;
          }
        }
        return true;
      }

      /**
       * Returns the {@code BOM} that was detected in the wrapped
       * {@code InputStream} object.
       *
       * @return a {@code BOM} value.
       */
      public final BOM getBOM()
      {
        // BOM type is immutable.
        return bom;
      }

      /**
       * Skips the {@code BOM} that was found in the wrapped {@code InputStream}
       * object. Only the first call has an effect. It is meant to be called
       * before anything is read from this stream, because it simply skips as many
       * bytes as the detected BOM is long.
       *
       * @return this {@code UnicodeBOMInputStream}.
       *
       * @throws IOException when trying to skip the BOM from the wrapped
       *         {@code InputStream} object.
       */
      public final synchronized UnicodeBOMInputStream skipBOM() throws IOException
      {
        if (!skipped)
        {
          // The detection bytes were pushed back by the constructor, so they
          // are what this skip consumes when nothing has been read yet.
          in.skip(bom.bytes.length);
          skipped = true;
        }
        return this;
      }

      /**
       * {@inheritDoc}
       */
      @Override
      public int read() throws IOException
      {
        return in.read();
      }

      /**
       * {@inheritDoc}
       */
      @Override
      public int read(final byte b[]) throws  IOException,
                                              NullPointerException
      {
        return in.read(b, 0, b.length);
      }

      /**
       * {@inheritDoc}
       */
      @Override
      public int read(final byte b[],
                      final int off,
                      final int len) throws IOException,
                                            NullPointerException
      {
        return in.read(b, off, len);
      }

      /**
       * {@inheritDoc}
       */
      @Override
      public long skip(final long n) throws IOException
      {
        return in.skip(n);
      }

      /**
       * {@inheritDoc}
       */
      @Override
      public int available() throws IOException
      {
        return in.available();
      }

      /**
       * {@inheritDoc}
       */
      @Override
      public void close() throws IOException
      {
        in.close();
      }

      /**
       * {@inheritDoc}
       */
      @Override
      public synchronized void mark(final int readlimit)
      {
        in.mark(readlimit);
      }

      /**
       * {@inheritDoc}
       */
      @Override
      public synchronized void reset() throws IOException
      {
        in.reset();
      }

      /**
       * {@inheritDoc}
       */
      @Override
      public boolean markSupported()
      {
        return in.markSupported();
      }

      private final PushbackInputStream in;
      private final BOM                 bom;
      private       boolean             skipped = false;

    } // UnicodeBOMInputStream

    And you use it this way:

    import java.io.BufferedReader;
    import java.io.FileInputStream;
    import java.io.InputStreamReader;
    
    /**
     * Demonstrates {@code UnicodeBOMInputStream}: prints the detected BOM, then
     * the first line of the same file twice, once with the BOM left in the
     * stream and once after {@code skipBOM()} has removed it.
     */
    public final class UnicodeBOMInputStreamUsage
    {
      /**
       * Runs the demonstration against {@code test/offending_bom.txt}.
       *
       * @param args unused.
       *
       * @throws Exception when the file cannot be opened or read.
       */
      public static void main(final String[] args) throws Exception
      {
        final String path = "test/offending_bom.txt";

        // First pass: the BOM is detected but left in the stream.
        FileInputStream fis = new FileInputStream(path);
        try
        {
          final UnicodeBOMInputStream ubis = new UnicodeBOMInputStream(fis);

          System.out.println("detected BOM: " + ubis.getBOM());

          System.out.print("Reading the content of the file without skipping the BOM: ");
          // NOTE: InputStreamReader without a charset decodes with the platform
          // default, as in the original example.
          final BufferedReader br = new BufferedReader(new InputStreamReader(ubis));

          System.out.println(br.readLine());
        }
        finally
        {
          // Closing the file stream releases the only system resource in the
          // chain, and does so even when detection or reading fails.
          fis.close();
        }

        // Second pass: the BOM is skipped before the first read.
        fis = new FileInputStream(path);
        try
        {
          final UnicodeBOMInputStream ubis = new UnicodeBOMInputStream(fis);
          final BufferedReader br = new BufferedReader(new InputStreamReader(ubis));

          ubis.skipBOM();

          System.out.print("Reading the content of the file after skipping the BOM: ");
          System.out.println(br.readLine());
        }
        finally
        {
          fis.close();
        }
      }

    } // UnicodeBOMInputStreamUsage
    

提交回复
热议问题