How to identify doc, docx, pdf, xls and xlsx based on file header in C#? I don't want to rely on the file extensions or on MimeMapping.GetMimeMapping for this as either o
The answer from user2173353 is the most correct one, given that the OP specifically mentioned Office file formats. However, I didn't like the idea of adding an entire library (OpenMCDF) just to identify legacy Office formats, so I wrote my own routine for doing just this.
/// <summary>
/// Identifies a Compound File Binary (CFB) document, such as a legacy Office file,
/// by reading the 128-bit class identifier stored in its first directory entry and
/// looking it up in <c>Formats</c>.
/// </summary>
/// <param name="fileData">
/// Seekable stream holding the file, with the CFB header at offset 0. The stream is
/// left open, but its position is changed by this call.
/// </param>
/// <returns>
/// The matching <see cref="CfbFileFormat"/>, or <see cref="CfbFileFormat.Unknown"/> when the
/// data is not a CFB file, is truncated or malformed, or carries an unrecognised identifier.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="fileData"/> is null.</exception>
/// <exception cref="ArgumentException"><paramref name="fileData"/> is not seekable.</exception>
public static CfbFileFormat GetCfbFileFormat(Stream fileData)
{
    if (fileData == null)
    {
        throw new ArgumentNullException(nameof(fileData));
    }

    if (!fileData.CanSeek)
    {
        throw new ArgumentException("Data stream must be seekable.", nameof(fileData));
    }

    try
    {
        // Values in a CFB file are always little-endian, which is what BinaryReader.ReadUInt16/ReadUInt32 read.
        // If using .net < 4.5 this BinaryReader constructor is not available. Use a simpler one but remember to also remove the 'using' statement.
        using (BinaryReader reader = new BinaryReader(fileData, Encoding.Unicode, true))
        {
            // Every offset below is absolute, so the signature must be read from the start
            // of the stream as well, regardless of where the caller left the position.
            fileData.Position = 0;

            // Check that data has the CFB file header
            var header = reader.ReadBytes(8);
            if (!header.SequenceEqual(new byte[] {0xD0, 0xCF, 0x11, 0xE0, 0xA1, 0xB1, 0x1A, 0xE1}))
            {
                return CfbFileFormat.Unknown;
            }

            // Sector shift: 2-byte unsigned value at offset 30 (0x1E), the sector size as a power of two.
            // The only valid values are 9 or 12, which gives 512 or 4096 byte sector size.
            // Anything else means a corrupt header (and a shift >= 32 would silently wrap).
            fileData.Position = 30;
            ushort sectorShift = reader.ReadUInt16();
            if (sectorShift != 9 && sectorShift != 12)
            {
                return CfbFileFormat.Unknown;
            }

            long sectorSize = 1L << sectorShift;

            // Get first directory sector index at offset 48 in the header
            fileData.Position = 48;
            uint rootDirectoryIndex = reader.ReadUInt32();

            // File header is one sector wide. After that we can address the sector directly using the sector index
            long rootDirectoryAddress = sectorSize + (rootDirectoryIndex * sectorSize);

            // The class identifier is 80 bytes into the directory entry. It is a 128 bit GUID, encoded as "DWORD, WORD, WORD, BYTE[8]".
            long guidAddress = rootDirectoryAddress + 80;

            // Reject indexes that point outside the data before seeking: some streams
            // (e.g. MemoryStream) throw from the Position setter for very large offsets.
            if (guidAddress + 16 > fileData.Length)
            {
                return CfbFileFormat.Unknown;
            }

            fileData.Position = guidAddress;
            int bits127_96 = reader.ReadInt32();
            short bits95_80 = reader.ReadInt16();
            short bits79_64 = reader.ReadInt16();
            byte[] bits63_0 = reader.ReadBytes(8);

            // ReadBytes returns fewer bytes at end of stream instead of throwing, and the
            // Guid constructor rejects arrays that are not exactly 8 bytes long.
            if (bits63_0.Length != 8)
            {
                return CfbFileFormat.Unknown;
            }

            var guid = new Guid(bits127_96, bits95_80, bits79_64, bits63_0);

            // Compare to known file format GUIDs
            CfbFileFormat result;
            return Formats.TryGetValue(guid, out result) ? result : CfbFileFormat.Unknown;
        }
    }
    catch (IOException)
    {
        // Includes EndOfStreamException raised by the fixed-width reads on truncated data.
        return CfbFileFormat.Unknown;
    }
    catch (OverflowException)
    {
        return CfbFileFormat.Unknown;
    }
}
/// <summary>
/// File formats that can be reported for a Compound File Binary (CFB) container.
/// </summary>
public enum CfbFileFormat
{
    /// <summary>Legacy Word document (.doc).</summary>
    Doc = 0,

    /// <summary>Legacy Excel workbook (.xls).</summary>
    Xls = 1,

    /// <summary>MSI file.</summary>
    Msi = 2,

    /// <summary>Legacy PowerPoint presentation (.ppt).</summary>
    Ppt = 3,

    /// <summary>Not a recognised CFB format.</summary>
    Unknown = 4
}
// Lookup table from the class identifier (GUID) found in a CFB file to the format it denotes.
// Two distinct identifiers map to Xls.
private static readonly Dictionary<Guid, CfbFileFormat> Formats = new Dictionary<Guid, CfbFileFormat>
{
    [Guid.Parse("{00020810-0000-0000-c000-000000000046}")] = CfbFileFormat.Xls,
    [Guid.Parse("{00020820-0000-0000-c000-000000000046}")] = CfbFileFormat.Xls,
    [Guid.Parse("{00020906-0000-0000-c000-000000000046}")] = CfbFileFormat.Doc,
    [Guid.Parse("{000c1084-0000-0000-c000-000000000046}")] = CfbFileFormat.Msi,
    [Guid.Parse("{64818d10-4f9b-11cf-86ea-00aa00b929e8}")] = CfbFileFormat.Ppt
};
Additional format identifiers can be added as needed.
I've tried this on .doc and .xls, and it has worked fine. I haven't tested on CFB files using 4096 byte sector size, as I don't even know where to find those.
The code is based on information from the following documents:
This question contains an example of using the first bytes of a file to determine the file type: Using .NET, how can you find the mime type of a file based on the file signature not the extension
It is a very long post, so I am posting the relevant answer below:
public class MimeType
{
// Leading byte signatures ("magic numbers") used to recognise file contents.
// Each array is compared against the first bytes of the file, using the array's own length.
private static readonly byte[] BMP = { 66, 77 };                                       // "BM"
private static readonly byte[] DOC = { 208, 207, 17, 224, 161, 177, 26, 225 };         // D0 CF 11 E0 A1 B1 1A E1 (CFB container: doc/xls/ppt/...)
private static readonly byte[] EXE_DLL = { 77, 90 };                                   // "MZ"
private static readonly byte[] GIF = { 71, 73, 70, 56 };                               // "GIF8"
private static readonly byte[] ICO = { 0, 0, 1, 0 };
private static readonly byte[] JPG = { 255, 216, 255 };
private static readonly byte[] MP3 = { 255, 251, 48 };
private static readonly byte[] OGG = { 79, 103, 103, 83, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0 }; // "OggS" + fixed header bytes
// "%PDF-" only: the version digits that follow are deliberately not part of the
// signature so that files declaring a version other than 1.x (e.g. "%PDF-2.0") match too.
private static readonly byte[] PDF = { 37, 80, 68, 70, 45 };
private static readonly byte[] PNG = { 137, 80, 78, 71, 13, 10, 26, 10, 0, 0, 0, 13, 73, 72, 68, 82 };
private static readonly byte[] RAR = { 82, 97, 114, 33, 26, 7, 0 };                    // "Rar!" 1A 07 00
private static readonly byte[] SWF = { 70, 87, 83 };                                   // "FWS"
private static readonly byte[] TIFF = { 73, 73, 42, 0 };                               // "II*\0"
private static readonly byte[] TORRENT = { 100, 56, 58, 97, 110, 110, 111, 117, 110, 99, 101 }; // "d8:announce"
private static readonly byte[] TTF = { 0, 1, 0, 0, 0 };
private static readonly byte[] WAV_AVI = { 82, 73, 70, 70 };                           // "RIFF"
private static readonly byte[] WMV_WMA = { 48, 38, 178, 117, 142, 102, 207, 17, 166, 217, 0, 170, 0, 98, 206, 108 };
private static readonly byte[] ZIP_DOCX = { 80, 75, 3, 4 };                            // "PK\x03\x04"

/// <summary>
/// Determines a MIME type from the leading bytes of a file. For container formats whose
/// signature is shared by several file types (Ogg, RIFF, ASF, CFB, ZIP) the file name
/// extension is used to pick between them.
/// </summary>
/// <param name="file">File contents; only the leading bytes are inspected.</param>
/// <param name="fileName">
/// Name of the file. When null, empty or whitespace, the default MIME type is returned
/// without inspecting <paramref name="file"/>.
/// </param>
/// <returns>
/// The detected MIME type, or "application/octet-stream" when no signature matches.
/// </returns>
/// <exception cref="ArgumentNullException">
/// <paramref name="file"/> is null and <paramref name="fileName"/> is not blank.
/// </exception>
public static string GetMimeType(byte[] file, string fileName)
{
    const string defaultMime = "application/octet-stream"; //DEFAULT UNKNOWN MIME TYPE

    //Ensure that the filename isn't empty or null
    if (string.IsNullOrWhiteSpace(fileName))
    {
        return defaultMime;
    }

    if (file == null)
    {
        throw new ArgumentNullException(nameof(file));
    }

    //Get the file extension (compared case-insensitively and culture-independently below)
    string extension = Path.GetExtension(fileName) ?? string.Empty;

    //Get the MIME Type
    if (HasSignature(file, BMP))
    {
        return "image/bmp";
    }

    if (HasSignature(file, DOC))
    {
        // All legacy Office documents share the CFB signature; use the extension to tell them apart.
        if (ExtensionIs(extension, ".XLS"))
        {
            return "application/vnd.ms-excel";
        }

        if (ExtensionIs(extension, ".PPT"))
        {
            return "application/vnd.ms-powerpoint";
        }

        return "application/msword";
    }

    if (HasSignature(file, EXE_DLL))
    {
        return "application/x-msdownload"; //both use same mime type
    }

    if (HasSignature(file, GIF))
    {
        return "image/gif";
    }

    if (HasSignature(file, ICO))
    {
        return "image/x-icon";
    }

    if (HasSignature(file, JPG))
    {
        return "image/jpeg";
    }

    if (HasSignature(file, MP3))
    {
        return "audio/mpeg";
    }

    if (HasSignature(file, OGG))
    {
        if (ExtensionIs(extension, ".OGX"))
        {
            return "application/ogg";
        }

        return ExtensionIs(extension, ".OGA") ? "audio/ogg" : "video/ogg";
    }

    if (HasSignature(file, PDF))
    {
        return "application/pdf";
    }

    if (HasSignature(file, PNG))
    {
        return "image/png";
    }

    if (HasSignature(file, RAR))
    {
        return "application/x-rar-compressed";
    }

    if (HasSignature(file, SWF))
    {
        return "application/x-shockwave-flash";
    }

    if (HasSignature(file, TIFF))
    {
        return "image/tiff";
    }

    if (HasSignature(file, TORRENT))
    {
        return "application/x-bittorrent";
    }

    if (HasSignature(file, TTF))
    {
        return "application/x-font-ttf";
    }

    if (HasSignature(file, WAV_AVI))
    {
        return ExtensionIs(extension, ".AVI") ? "video/x-msvideo" : "audio/x-wav";
    }

    if (HasSignature(file, WMV_WMA))
    {
        return ExtensionIs(extension, ".WMA") ? "audio/x-ms-wma" : "video/x-ms-wmv";
    }

    if (HasSignature(file, ZIP_DOCX))
    {
        // The newer Office formats are ZIP archives; use the extension to tell them apart.
        if (ExtensionIs(extension, ".DOCX"))
        {
            return "application/vnd.openxmlformats-officedocument.wordprocessingml.document";
        }

        if (ExtensionIs(extension, ".XLSX"))
        {
            return "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet";
        }

        if (ExtensionIs(extension, ".PPTX"))
        {
            return "application/vnd.openxmlformats-officedocument.presentationml.presentation";
        }

        return "application/x-zip-compressed";
    }

    return defaultMime;
}

// True when data begins with every byte of signature; data shorter than the signature never matches.
private static bool HasSignature(byte[] data, byte[] signature)
{
    if (data.Length < signature.Length)
    {
        return false;
    }

    for (int i = 0; i < signature.Length; i++)
    {
        if (data[i] != signature[i])
        {
            return false;
        }
    }

    return true;
}

// Ordinal, case-insensitive extension comparison; avoids culture-specific casing
// (e.g. under a Turkish culture "i" does not upper-case to "I").
private static bool ExtensionIs(string extension, string expected)
{
    return string.Equals(extension, expected, StringComparison.OrdinalIgnoreCase);
}
Using file signatures it is not so feasible (since the new office formats are ZIP files and the old Office files are OLE CF / OLE SS containers), but you can use C# code to read them and figure out what they are.
For the newest Office formats, you can read the (DOCX/PPTX/XLSX/...) ZIP file using System.IO.Packaging: https://msdn.microsoft.com/en-us/library/ms568187(v=vs.110).aspx
Doing that, you can find the ContentType of the first document part and infer the format from it.
For older Office files (Office 2003) you can use this library to distinguish them based on their contents (note that MSI and MSG files are also using this file format): http://sourceforge.net/projects/openmcdf/
E.g., here are the contents of an XLS file:
I hope this helps! :)
It would have certainly helped me, if I had found this answer earlier. ;)
user2173353 has what appears to be the correct solution for detecting the new Office .docx / .xlsx formats. To add some details to this, the below check appears to identify these correctly:
/// <summary>
/// MS .docx, .xlsx and other extensions are (correctly) identified as zip files using signature lookup.
/// This tests if System.IO.Packaging is able to open the data as a package; if it can and the
/// package has parts, the data is treated as a package rather than a plain zip file.
/// </summary>
/// <param name="stream">Stream containing the candidate file; opened read-only.</param>
/// <returns>
/// <c>true</c> when the stream opens as a package that contains at least one part;
/// <c>false</c> when it has no parts or cannot be opened as a package.
/// </returns>
private static bool IsPackage(this Stream stream)
{
    try
    {
        // Dispose the package so the resources it allocates are released deterministically.
        using (Package package = Package.Open(stream, FileMode.Open, FileAccess.Read))
        {
            return package.GetParts().Any();
        }
    }
    catch (FileFormatException)
    {
        // Data is not a valid package: report "not a package" instead of failing the probe.
        return false;
    }
    catch (InvalidDataException)
    {
        return false;
    }
    catch (IOException)
    {
        return false;
    }
}