问题
In a C array/string, how do I correctly detect whether something is at the start of a file when the file may have a BOM? Sometimes the BOM takes up 1 character, sometimes it takes up 3 characters, and sometimes it is not present at all, so the actual location of x does not always start at index 0.
Most of the time the BOM is this (in hex): "ef bb bf". For example:
ef bb bf 23 21 2f 62 69 6e 2f 62 61 73 68 0a 61 20 26 26 20 62 0a 67 20 : ...#!/bin/bash.a && b.g
Would it be something like this?
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
/*
 * Result of probing the start of a buffer for a byte order mark (BOM).
 *
 * Filled in by the ifbom / elifbom / elbom macro chain.
 */
struct BOM {
    int is_BOM;             /* non-zero when a known BOM was recognised */
    int length;             /* size of the BOM in bytes; 0 when none was found */
    int type;               /* numeric id assigned by the detector; 0 when none was found */
    char * type_as_string;  /* human-readable encoding name, e.g. "UTF-8" */
    char * BOM;             /* the raw BOM bytes (may contain 0x00 bytes), or placeholder text */
};  /* the terminating ';' is required: without it the next declaration is parsed as part of this one */
/*
 * Report whether the first length_ bytes of string_ equal a BOM pattern.
 *
 * BOM_    - pattern written with a leading '^' marker (e.g. "^\xef\xbb\xbf");
 *           the marker is skipped and only the bytes after it are compared.
 * string_ - bytes to test. The comparison is length-based, so 0x00 bytes in
 *           the pattern are compared like any other byte. The caller must
 *           make sure string_ is readable up to the first mismatching byte
 *           (at most length_ bytes).
 * length_ - number of pattern bytes to compare.
 *
 * Returns 1 when all length_ bytes are equal, 0 otherwise. A NULL argument
 * or a non-positive length never matches.
 */
int matches(const char * BOM_, const char * string_, int length_) {
    if (BOM_ == NULL || string_ == NULL || length_ <= 0) {
        return 0;
    }

    /* Skip the '^' marker that prefixes every pattern. */
    const char * b = BOM_ + 1;

    for (int i = 0; i < length_; i++) {
        /* Stop at the first difference so nothing past it is ever read. */
        if (string_[i] != b[i]) {
            return 0;
        }
    }
    return 1;
}
/*
 * ifbom: first arm of the BOM detection chain.
 *
 * Expands to a bare `if (...) { ... }` statement (deliberately NOT wrapped in
 * do/while) so that elifbom / elbom can attach `else` arms to it. When
 * matches(BOM_, string_, length_) is true, the fields of bom_struct are
 * filled in from the remaining arguments.
 *
 * BOM_ must start with a '^' marker byte; the stored pointer skips it.
 * Every argument is parenthesized so expressions such as `*ptr` or `a + b`
 * expand correctly. BOM_ and length_ are evaluated more than once, so do not
 * pass expressions with side effects.
 */
#define ifbom(bom_struct, is_BOM_, length_, type_, type_as_string_, BOM_, string_) if (matches((BOM_), (string_), (length_))) { \
    (bom_struct).is_BOM = (is_BOM_); \
    (bom_struct).length = (length_); \
    (bom_struct).type = (type_); \
    (bom_struct).type_as_string = (type_as_string_); \
    (bom_struct).BOM = (BOM_) + 1 /* remove the ^ at the start */ ; \
}
/*
 * elifbom: continuation arm of the BOM detection chain.
 *
 * Places an `else` in front of an ifbom expansion, so it must directly follow
 * an ifbom or another elifbom. Arguments are forwarded to ifbom unchanged.
 */
#define elifbom(target, found, nbytes, id, label, pattern, input) \
    else ifbom(target, found, nbytes, id, label, pattern, input)
/*
 * elbom: final arm of the BOM detection chain.
 *
 * Expands to a bare `else { ... }` block, so it must directly follow an
 * ifbom / elifbom (or any other `if`). It unconditionally fills in the fields
 * of bom_struct from the remaining arguments. Unlike ifbom, BOM_ is stored
 * as-is: no '^' marker is expected or skipped.
 *
 * Every argument is parenthesized so expressions such as `*ptr` expand
 * correctly.
 */
#define elbom(bom_struct, is_BOM_, length_, type_, type_as_string_, BOM_) else { \
    (bom_struct).is_BOM = (is_BOM_); \
    (bom_struct).length = (length_); \
    (bom_struct).type = (type_); \
    (bom_struct).type_as_string = (type_as_string_); \
    (bom_struct).BOM = (BOM_); \
}
#define cat 0
#define hex 1
#define both 2
#define json 3
int mode;
/*
 * Print one row of a dump to stdout, formatted according to the global `mode`.
 *
 * buffer - bytes making up the row.
 * index  - number of valid bytes in buffer.
 * width  - nominal row width; in `both` mode one tab is printed for every
 *          byte the row falls short of it.
 *
 * hex / both : each byte as two lowercase hex digits followed by a space.
 * both       : then the tab padding followed by ": ".
 * cat / both / json : each byte as a character, with '.' standing in for
 *              anything below 32 or at/above 127.
 * A newline always ends the row.
 */
void __hexdump(unsigned char *buffer, unsigned long index, unsigned long width)
{
    const int want_hex = (mode == both || mode == hex);
    const int want_text = (mode == cat || mode == both || mode == json);
    unsigned long pos;

    if (want_hex) {
        pos = 0;
        while (pos < index) {
            printf("%02x ", buffer[pos]);
            pos++;
        }
    }

    if (mode == both) {
        /* One tab per missing byte so the text column lines up on short rows. */
        unsigned long missing = (index < width) ? (width - index) : 0;
        while (missing > 0) {
            printf("\t");
            missing--;
        }
        printf(": ");
    }

    if (want_text) {
        pos = 0;
        while (pos < index) {
            const unsigned char byte = buffer[pos];
            if (byte >= 32 && byte < 127) {
                printf("%c", byte);
            } else {
                printf(".");
            }
            pos++;
        }
    }

    printf("\n");
}
/*
 * Dump a slice of a NUL-terminated string, `width` bytes per row, by handing
 * each row to __hexdump().
 *
 * infile - NUL-terminated input. Scanning stops at the first 0x00 byte, so
 *          data containing embedded zero bytes (e.g. a UTF-32 BOM) is cut
 *          short there.
 * start  - index of the first byte to dump.
 * stop   - index of the last byte to dump (inclusive).
 * width  - number of bytes per output row; must be non-zero.
 *
 * Returns 0 on success, or -1 when infile is NULL, width is 0, or the row
 * buffer cannot be allocated (the allocation failure is also reported on
 * stdout).
 */
int __hexdump_string(char *infile, unsigned long start, unsigned long stop, unsigned long width)
{
    /* A zero width would make the first store write into a zero-sized buffer. */
    if (infile == NULL || width == 0) {
        return -1;
    }

    unsigned char *byte_buffer = malloc(width);
    if (byte_buffer == NULL)
    {
        printf("Could not allocate memory for byte_buffer\n");
        return -1;
    }

    unsigned long bb_index = 0;

    /* Nothing beyond `stop` is ever emitted, so stop scanning once past it. */
    for (unsigned long f_index = 0; infile[f_index] != '\0' && f_index <= stop; f_index++)
    {
        if (f_index < start) {
            continue;
        }

        byte_buffer[bb_index] = (unsigned char)infile[f_index];
        bb_index++;

        /* Row is full: flush it and start a new one. */
        if (bb_index >= width)
        {
            __hexdump(byte_buffer, bb_index, width);
            bb_index = 0;
        }
    }

    /* Flush the final, partially filled row. */
    if (bb_index) {
        __hexdump(byte_buffer, bb_index, width);
    }

    free(byte_buffer);
    return 0;
}
/*
 * builtin__BOM_print: print every field of a struct BOM to stdout.
 *
 * The argument is stringified, so each output line is prefixed with the
 * expression text that was passed in (e.g. "t.length = 3"). The BOM bytes
 * are then dumped through __hexdump_string() with a row width of 5.
 *
 * Side effect: sets the global `mode` to `both` and leaves it that way.
 * The argument is evaluated several times; pass a plain variable.
 * Note that __hexdump_string() treats its stop index as inclusive and stops
 * at the first 0x00 byte.
 *
 * Wrapped in do { } while (0) so the macro behaves as a single statement
 * (safe inside an unbraced if/else); invoke it with a trailing ';'.
 */
#define builtin__BOM_print(bom_struct) do { \
    printf("%s.is_BOM = %s\n%s.length = %d\n%s.type = %d\n%s.type_as_string = %s\n%s.BOM = ", #bom_struct, (bom_struct).is_BOM?"yes":"no", #bom_struct, (bom_struct).length, #bom_struct, (bom_struct).type, #bom_struct, (bom_struct).type_as_string, #bom_struct); \
    mode = both; \
    __hexdump_string((bom_struct).BOM, 0, (bom_struct).length, 5); \
} while (0)
/*
 * Identify the byte order mark (if any) at the start of `string`.
 *
 * string - bytes to inspect. No length is passed in and some patterns
 *          contain 0x00 bytes, so the buffer should have at least 5 readable
 *          bytes (the longest pattern); shorter buffers may be read past
 *          their end.
 *
 * Returns a struct BOM describing the first pattern that matches. When no
 * pattern matches, is_BOM is false, length and type are 0, and both string
 * fields are "Not present". The real data starts at string + length.
 *
 * Order matters: a pattern that is a prefix of a longer one must be tested
 * after it. FF FE 00 00 (UTF-32 LE) therefore comes before FF FE (UTF-16 LE),
 * and the 5-byte UTF-7 form comes before the 4-byte forms.
 */
struct BOM builtin__BOM_get(char * string) {
    struct BOM bom;
    ifbom(bom, true, 3, 1, "UTF-8", "^\xef\xbb\xbf", string)
    elifbom(bom, true, 2, 2, "UTF-16 (BE)", "^\xfe\xff", string)
    /* Must precede UTF-16 (LE): its first two bytes are the UTF-16 LE BOM. */
    elifbom(bom, true, 4, 5, "UTF-32 (LE)", "^\xff\xfe\x00\x00", string)
    elifbom(bom, true, 2, 3, "UTF-16 (LE)", "^\xff\xfe", string)
    elifbom(bom, true, 4, 4, "UTF-32 (BE)", "^\x00\x00\xfe\xff", string)
    elifbom(bom, true, 5, 6, "UTF-7", "^\x2b\x2f\x76\x38\x3d", string)
    elifbom(bom, true, 4, 7, "UTF-7", "^\x2b\x2f\x76\x38", string)
    elifbom(bom, true, 4, 8, "UTF-7", "^\x2b\x2f\x76\x39", string)
    elifbom(bom, true, 4, 9, "UTF-7", "^\x2b\x2f\x76\x2b", string)
    elifbom(bom, true, 4, 10, "UTF-7", "^\x2b\x2f\x76\x2f", string)
    elifbom(bom, true, 3, 11, "UTF-1", "^\xf7\x64\x4c", string)
    elifbom(bom, true, 4, 12, "UTF-EBCDIC", "^\xdd\x73\x66\x73", string)
    elifbom(bom, true, 3, 13, "SCSU", "^\x0e\xfe\xff", string)
    elifbom(bom, true, 3, 14, "BOCU-1", "^\xfb\xee\x28", string)
    elifbom(bom, true, 4, 15, "GB-18030", "^\x84\x31\x95\x33", string)
    elbom(bom, false, 0, 0, "Not present", "Not present")
    /* C has no cast to a struct type; return the object directly. */
    return bom;
}
/*
 * Demo entry point: probes the literal "test" for a BOM and prints the
 * resulting struct BOM.
 */
int main()
{
/* "test" starts with plain ASCII, so none of the BOM patterns match it. */
struct BOM t = builtin__BOM_get("test");
/* The print macro stringifies its argument, so output lines start with "t.". */
builtin__BOM_print(t);
return 0;
}
回答1:
You should read the first few characters to determine whether a BOM is present.
- if 4 first chars are FF FE 00 00 : little endian UTF-32
- else if 2 first chars are FF FE : little endian UTF-16
- else if 4 first chars are 00 00 FE FF : big endian UTF-32
- else if 2 first chars are FE FF : big endian UTF-16
- else if 3 first chars are EF BB BF : UTF-8
- etc...
Depending on the BOM length, you know at which index the real file data starts.
You can find a more complete list of BOMs on the Wikipedia page: https://en.wikipedia.org/wiki/Byte_order_mark#Byte_order_marks_by_encoding
来源:https://stackoverflow.com/questions/51518244/c-how-to-skip-bom-when-checking-if-x-is-at-the-start-of-a-file