How to determine the encoding of text?

前端 未结 10 1488
一向
一向 2020-11-21 07:47

I received some text that is encoded, but I don't know what charset was used. Is there a way to determine the encoding of a text file using Python? How can I detect the enc…

10条回答
  •  遥遥无期
    2020-11-21 08:15

    # Function: OpenRead(file)
    
    # A text file can be encoded using:
    #   (1) the default operating system code page, or
    #   (2) utf8 with a BOM header
    #
    #  If a text file is encoded with utf8 and does not have a BOM header,
    #  the user can manually add a BOM header to the text file
    #  using a text editor such as Notepad++ and rerun the Python script;
    #  otherwise the file is read as a codepage file with the
    #  invalid codepage characters removed.
    
    import sys
    # Abort early on anything other than Python 3: the code below passes
    # encoding=... to the built-in open(), which Python 2 does not accept.
    # sys.version_info is the structured version tuple; parsing the first
    # character of the sys.version string is fragile.
    if sys.version_info[0] != 3:
        print('Aborted: Python 3.x required')
        sys.exit(1)
    
    def bomType(file):
        """
        Return the encoding name to pass to open() for *file*, based on its BOM.

        The first four bytes of the file are inspected for a byte order mark:

            UTF-8 BOM (EF BB BF)             -> "utf-8-sig"
            UTF-32 BOM (FF FE 00 00 / 00 00 FE FF) -> "utf32"
            UTF-16 BOM (FF FE / FE FF)       -> "utf16"
            no BOM                           -> "cp1252"

        Args:
            file: path of the file to inspect.

        Returns:
            A codec name accepted by open(..., encoding=...).

        Raises:
            OSError: if the file cannot be opened or read.

        EXAMPLE:
            bom = bomType(file)
            open(file, encoding=bom, errors='ignore')
        """
        # Local import: only this function needs the BOM constants.
        import codecs

        # Four bytes are enough to hold the longest BOM (UTF-32).
        with open(file, 'rb') as f:
            head = f.read(4)

        if head.startswith(codecs.BOM_UTF8):
            # "utf-8-sig" consumes the BOM while decoding; plain "utf8" would
            # leave U+FEFF as the first character of the text.
            return "utf-8-sig"

        # UTF-32 must be tested before UTF-16 because the UTF-32 LE BOM
        # (FF FE 00 00) begins with the UTF-16 LE BOM (FF FE).
        # NOTE: a UTF-16 LE file whose first character is U+0000 is
        # indistinguishable from UTF-32 LE here and is reported as "utf32".
        if head.startswith((codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE)):
            return "utf32"

        # The generic "utf16" / "utf32" codecs read the BOM themselves to
        # pick the byte order, so the endianness need not be returned.
        if head.startswith((codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE)):
            return "utf16"

        # If no BOM is present, assume the file uses the operating system's
        # code page. For the United States that is cp1252; change this
        # default if your system uses a different code page.
        return "cp1252"
    
    
    def OpenRead(file):
        """
        Open *file* for reading as text, using the encoding chosen by bomType().

        Decoding uses errors='ignore', so bytes that are invalid in the
        chosen encoding are skipped rather than raising. The open file
        object is returned; the caller is responsible for closing it.
        """
        return open(file, mode='r', errors='ignore', encoding=bomType(file))
    
    
    #######################
    # Testing it
    #######################
    # Context managers close each file even if a write or read fails.
    with open("myfile1.txt", "w", encoding="cp1252") as fout:
        fout.write("* hi there (cp1252)")

    # NOTE: encoding="utf8" writes no BOM, so bomType() cannot recognise
    # this file as UTF-8 and it is read back below using the cp1252 fallback.
    with open("myfile2.txt", "w", encoding="utf8") as fout:
        fout.write("\u2022 hi there (utf8)")

    # This case is still treated like codepage cp1252.
    #   (User responsible for making sure that all utf8 files
    #   have a BOM header.)
    with open("badboy.txt", "wb") as fout:
        fout.write(b"hi there.  barf(\x81\x8D\x90\x9D)")

    # Read example file with BOM detection
    with OpenRead("myfile1.txt") as fin:
        L = fin.readline()
    print(L)

    # Read example file with BOM detection
    with OpenRead("myfile2.txt") as fin:
        L = fin.readline()
    print(L)  # requires QtConsole to view, Cmd.exe is cp1252

    # Read CP1252 with a few undefined chars without barfing
    with OpenRead("badboy.txt") as fin:
        L = fin.readline()
    print(L)

    # Check that bad characters are still in the badboy codepage file.
    # The raw bytes are printed so the check is visible when run as a script
    # (a bare fin.read(20) only echoes its result in an interactive session).
    with open("badboy.txt", "rb") as fin:
        print(fin.read(20))
    

提交回复
热议问题