I need to decide when (not) to convert a text file based on the known file encoding and the desired output encoding.
If the text is US-ASCII, I don't need to convert it.
An alternative approach is to decode the bytes 0x00 - 0x7F in the given encoding, and check that the characters match ASCII. For example, in Python 3.x:
def is_ascii_superset(encoding):
    """Return True if `encoding` decodes every byte 0x00-0x7F to the same ASCII character.

    Each of the 128 byte values is decoded on its own using `encoding`
    (decode errors are ignored, so an undecodable byte yields an empty
    string and therefore counts as a mismatch) and compared with
    `chr(codepoint)`.

    An unknown encoding name is not handled here; the error raised by
    `bytes.decode` propagates to the caller.
    """
    return all(
        bytes([codepoint]).decode(encoding, 'ignore') == chr(codepoint)
        for codepoint in range(128)
    )
This gives:
>>> is_ascii_superset('US-ASCII')
True
>>> is_ascii_superset('windows-1252')
True
>>> is_ascii_superset('ISO-8859-15')
True
>>> is_ascii_superset('UTF-8')
True
>>> is_ascii_superset('UTF-16')
False
>>> is_ascii_superset('IBM500') # a variant of EBCDIC
False
EDIT: Get US-ASCII compatibility for each encoding supported by your Qt version in C++:
#include <QTextCodec>
#include <QMap>
// Result of probing a Qt text codec for US-ASCII compatibility.
enum tQtCodecType
{
    eQtCodecUndefined         = 0, // codec could not be obtained, so it was not tested
    eQtCodecAsciiIncompatible = 1, // at least one byte in 0..127 decodes differently from the reference
    eQtCodecAsciiCompatible   = 2  // all bytes in 0..127 decode identically to the reference
};
/**
 * Classify every codec reported by QTextCodec::availableCodecs() by whether
 * it decodes the byte values 0..127 the same way the UTF-8 codec does.
 *
 * UTF-8 is used as the reference because it is a superset of US-ASCII in the
 * lower 7 bits, so only those 128 byte values are tested.
 *
 * @return Map from codec name to its classification:
 *         - eQtCodecAsciiCompatible   if all 128 bytes decode identically,
 *         - eQtCodecAsciiIncompatible if any byte decodes differently,
 *         - eQtCodecUndefined         if the codec could not be obtained.
 *         The map is empty if the UTF-8 reference codec is unavailable.
 */
QMap<QByteArray, tQtCodecType> QtCodecTypes()
{
    QMap<QByteArray, tQtCodecType> CodecTypes;

    const QList<QByteArray> available = QTextCodec::availableCodecs();

    QTextCodec *referenceCodec = QTextCodec::codecForName("UTF-8");
    if(!referenceCodec)
    {
        qDebug("Unable to get reference codec 'UTF-8'");
        return CodecTypes;
    }

    // The reference decoding does not depend on the codec under test, so
    // build it once instead of re-decoding it for every codec.
    const int asciiCount = 128;
    QString reference[asciiCount];
    for(int j = 0; j < asciiCount; ++j)
    {
        const char c = static_cast<char>(j);
        reference[j] = referenceCodec->toUnicode(&c, 1);
    }

    for(int i = 0; i < available.count(); ++i)
    {
        const QByteArray name = available.at(i);
        QTextCodec *currCodec = QTextCodec::codecForName(name);
        if(!currCodec)
        {
            qDebug("Unable to get codec for '%s'", qPrintable(QString(name)));
            CodecTypes.insert(name, eQtCodecUndefined);
            continue;
        }

        tQtCodecType type = eQtCodecAsciiCompatible;
        for(int j = 0; j < asciiCount; ++j)
        {
            const char c = static_cast<char>(j);
            // Decode the single byte with the codec under test and compare the
            // resulting QString with the reference; stop at the first mismatch.
            if(currCodec->toUnicode(&c, 1) != reference[j])
            {
                type = eQtCodecAsciiIncompatible;
                break;
            }
        }
        CodecTypes.insert(name, type);
    }

    return CodecTypes;
}