问题
For a C# web application, I want to index text from PDF, DOC, and similar files stored in a database.
I have been experimenting with an IFilter example on Code Project which works great for files from the file system, but my files are stored in a MS-SQL database.
Can anyone help me locate a sample to extract text from files stored in a database or have an idea on how to modify the Code Project code to work with a database instead of the file system?
回答1:
Finally after many hours I figured out how to make this work! I needed to run IFilter on PDF content stored in a database and I wanted to avoid saving the data to temporary files.
First I tried to use the BindIFilterFromStream API to create an IFilter for content stored in a Stream, but it seems that it doesn't work properly (at least not for this scenario). So don't go that way.
Instead you need to create an IFilter for a file extension (or access it some other way). Then you can access the IPersistStream COM interface and use it to load the PDF content into the IFilter. The rest works the same as for files. However, note that the IPersistStream API may not be implemented by every IFilter. It works for the Adobe PDF IFilter though.
The code should look like this (I removed some return code checking to make the code more readable, however, you should check all possible return codes).
/// <summary>
/// Extracts the text of a PDF document held in a stream by loading it into the
/// PDF IFilter through IPersistStream, without writing a temporary file.
/// </summary>
/// <param name="s">Seekable stream positioned at the start of the PDF content.</param>
/// <returns>The concatenated text of all text chunks reported by the filter.</returns>
/// <exception cref="InvalidCastException">The loaded filter does not implement IPersistStream.</exception>
private string ParseIFilter(Stream s)
{
    // Get an IFilter for a file or file extension
    IFilter filter = null;
    FilterReturnCodes loadResult = NativeMethods.LoadIFilter(".pdf", null, ref filter);
    if (loadResult != FilterReturnCodes.S_OK || filter == null)
    {
        Marshal.ThrowExceptionForHR((int)loadResult);
        // ThrowExceptionForHR does not throw for success codes, so fail explicitly.
        throw new InvalidOperationException("No IFilter could be loaded for the .pdf extension.");
    }

    System.Runtime.InteropServices.ComTypes.IStream comStream = null;
    try
    {
        // Read the remaining content fully; a single Stream.Read call may return fewer bytes.
        byte[] buffer = new byte[Math.Max(0L, s.Length - s.Position)];
        int total = 0;
        while (total < buffer.Length)
        {
            int read = s.Read(buffer, total, buffer.Length - total);
            if (read <= 0)
            {
                break;
            }
            total += read;
        }

        // Copy the content to global memory
        // NOTE(review): CreateStreamOnHGlobal documents a GMEM_MOVEABLE handle; confirm that
        // memory from Marshal.AllocHGlobal is acceptable on the target platforms.
        IntPtr nativePtr = Marshal.AllocHGlobal(total);
        try
        {
            Marshal.Copy(buffer, 0, nativePtr, total);
            // Create a COM stream; with fDeleteOnRelease = true the stream owns the memory.
            int hr = NativeMethods.CreateStreamOnHGlobal(nativePtr, true, out comStream);
            Marshal.ThrowExceptionForHR(hr);
        }
        catch
        {
            // The stream never took ownership, so the block must be freed here.
            if (comStream == null)
            {
                Marshal.FreeHGlobal(nativePtr);
            }
            throw;
        }

        // Load the contents to the iFilter using IPersistStream interface
        var persistStream = (IPersistStream)filter;
        persistStream.Load(comStream);

        // Initialize iFilter
        FilterFlags filterFlags;
        FilterReturnCodes initResult = filter.Init(
            FilterInit.IFILTER_INIT_INDEXING_ONLY, 0, IntPtr.Zero, out filterFlags);
        if (initResult != FilterReturnCodes.S_OK)
        {
            Marshal.ThrowExceptionForHR((int)initResult);
        }

        return ExtractTextFromIFilter(filter);
    }
    finally
    {
        // Release the COM wrappers deterministically instead of waiting for finalization.
        Marshal.ReleaseComObject(filter);
        if (comStream != null)
        {
            Marshal.ReleaseComObject(comStream);
        }
    }
}
Text extraction from the filter looks like this in my code. There are many examples of this on the web and it can be implemented in many ways depending on what you need.
/// <summary>
/// Walks every chunk of an initialized filter and concatenates the text of the text chunks.
/// </summary>
/// <param name="filter">Filter that has already been loaded and initialized.</param>
/// <returns>All extracted text; empty when the filter reports no text chunks.</returns>
private string ExtractTextFromIFilter(IFilter filter)
{
    var sb = new StringBuilder();
    while (true)
    {
        StatChunk chunk;
        FilterReturnCodes result = filter.GetChunk(out chunk);
        if (result == FilterReturnCodes.S_OK)
        {
            // flags is a bit mask, so test the text bit rather than comparing for equality.
            if ((chunk.flags & ChunkState.CHUNK_TEXT) != 0)
            {
                sb.Append(ExtractTextFromChunk(filter, chunk));
            }
            continue;
        }
        if (result == FilterReturnCodes.FILTER_E_END_OF_CHUNKS)
        {
            return sb.ToString();
        }
        if (result == FilterReturnCodes.FILTER_E_EMBEDDING_UNAVAILABLE ||
            result == FilterReturnCodes.FILTER_E_LINK_UNAVAILABLE)
        {
            // Only this chunk is unavailable; later chunks can still be read.
            continue;
        }
        Marshal.ThrowExceptionForHR((int)result);
    }
}
/// <summary>
/// Reads all text of the current chunk by calling GetText until the filter signals the end.
/// </summary>
/// <param name="filter">Filter positioned on the chunk to read.</param>
/// <param name="chunk">Descriptor of the current chunk (not inspected here).</param>
/// <returns>
/// The chunk text; an empty string when the filter reports FILTER_E_NO_TEXT.
/// Any other non-success code ends the chunk and returns what was collected so far.
/// </returns>
private string ExtractTextFromChunk(IFilter filter, StatChunk chunk)
{
    const int BufferChars = 16384;
    var sb = new StringBuilder();
    while (true)
    {
        int sizeBuffer = BufferChars;
        var buffer = new StringBuilder(sizeBuffer);
        FilterReturnCodes result = filter.GetText(ref sizeBuffer, buffer);

        if (result == FilterReturnCodes.S_OK || result == FilterReturnCodes.FILTER_S_LAST_TEXT)
        {
            // Never read past what the marshalled builder holds, whatever count the filter reports.
            int count = Math.Min(sizeBuffer, buffer.Length);
            if (count > 0)
            {
                sb.Append(buffer.ToString(0, count));
            }
            if (result == FilterReturnCodes.FILTER_S_LAST_TEXT)
            {
                return sb.ToString();
            }
            continue;
        }

        if (result == FilterReturnCodes.FILTER_E_NO_TEXT)
        {
            return string.Empty;
        }

        // FILTER_E_NO_MORE_TEXT or any other code: the chunk is finished.
        return sb.ToString();
    }
}
And here are the definitions of native methods and the structures used by them.
// P/Invoke entry points used to obtain an IFilter and to wrap a global memory block in a COM stream.
internal static class NativeMethods
{
// Imported from query.dll. Receives a path or file extension in pwcsPath and hands the
// filter back through ppIUnk; the status is returned as a FilterReturnCodes value.
[DllImport("query.dll", SetLastError = true, CharSet = CharSet.Unicode)]
public static extern FilterReturnCodes LoadIFilter(
string pwcsPath,
[MarshalAs(UnmanagedType.IUnknown)] object pUnkOuter,
ref IFilter ppIUnk);
// Imported from ole32.dll. Returns an int status code and the created stream in ppstm.
// NOTE(review): fDeleteOnRelease presumably hands ownership of hGlobal to the stream — confirm against the Win32 documentation.
[DllImport("ole32.dll")]
public static extern int CreateStreamOnHGlobal(IntPtr hGlobal, bool fDeleteOnRelease, out IStream ppstm);
}
// Managed declaration of the IFilter COM interface (IUnknown-based).
// Every method is [PreserveSig], so the status comes back as a FilterReturnCodes value
// instead of being turned into an exception.
// Do not reorder the methods: for a COM interface the declaration order is the vtable order.
[ComImport, Guid("89BCB740-6119-101A-BCB7-00DD010655AF")]
[InterfaceType(ComInterfaceType.InterfaceIsIUnknown)]
public interface IFilter
{
// Prepares the filter; pdwFlags receives FilterFlags reported by the filter.
[PreserveSig]
FilterReturnCodes Init(FilterInit grfFlags, int cAttributes, IntPtr aAttributes, out FilterFlags pdwFlags);
// Advances to the next chunk and returns its descriptor in pStat.
[PreserveSig]
FilterReturnCodes GetChunk(out StatChunk pStat);
// Retrieves text of the current chunk. pcwcBuffer is passed by reference: the caller sets
// the buffer capacity and reads the value back as the number of characters to use.
[PreserveSig]
FilterReturnCodes GetText(
ref int pcwcBuffer,
[Out, MarshalAs(UnmanagedType.LPWStr)] StringBuilder awcBuffer);
// Declared to keep the vtable complete; not called by the code in this sample.
[PreserveSig]
FilterReturnCodes GetValue(ref IntPtr propVal);
// Declared to keep the vtable complete; not called by the code in this sample.
[PreserveSig]
FilterReturnCodes BindRegion(ref FilterRegion origPos, ref Guid riid, ref object ppunk);
}
// Managed declaration of the IPersist COM interface; base of IPersistStream.
[InterfaceType(ComInterfaceType.InterfaceIsIUnknown), Guid("0000010c-0000-0000-C000-000000000046")]
public interface IPersist
{
// Returns the class identifier of the object in pClassID. A failure surfaces as an
// exception because the method is not marked [PreserveSig].
void GetClassID(out Guid pClassID);
}
// Managed declaration of the IPersistStream COM interface, used to feed document content
// to a filter from a stream. Method order is the vtable order and must not change.
[InterfaceType(ComInterfaceType.InterfaceIsIUnknown), Guid("00000109-0000-0000-C000-000000000046")]
public interface IPersistStream : IPersist
{
// Re-declared with 'new' so the inherited IPersist slot is present in this interface's
// vtable; COM interop does not add base-interface methods automatically.
new void GetClassID(out Guid pClassID);
// Returns the raw status as an int ([PreserveSig]) rather than throwing.
[PreserveSig]
int IsDirty();
// Loads the object state from pStm; this is the call used to hand content to the filter.
void Load([In] IStream pStm);
// Writes the object state to pStm; fClearDirty is marshalled as a 4-byte BOOL.
void Save(
[In] IStream pStm,
[In, MarshalAs(UnmanagedType.Bool)] bool fClearDirty);
// Reports a size in pcbSize. NOTE(review): presumably the maximum size needed by Save — confirm.
void GetSizeMax(out long pcbSize);
}
// Chunk descriptor filled in by IFilter.GetChunk. Field order and sizes define the native
// layout, so fields must not be reordered or resized.
public struct StatChunk
{
// Identifier of the chunk.
public int idChunk;
// Break type of the chunk, marshalled as a 4-byte unsigned value.
[MarshalAs(UnmanagedType.U4)]
public ChunkBreaktype breakType;
// Chunk kind flags, marshalled as a 4-byte unsigned value; the extraction code checks
// this for ChunkState.CHUNK_TEXT.
[MarshalAs(UnmanagedType.U4)]
public ChunkState flags;
// NOTE(review): presumably the locale identifier (LCID) of the chunk text — confirm.
public int locale;
// Property specification attached to the chunk.
public FullPropSpec attribute;
// NOTE(review): the three fields below presumably locate the chunk within its source chunk — confirm.
public int idChunkSource;
public int cwcStartSource;
public int cwcLenSource;
}
/// <summary>
/// Break kind reported in <c>StatChunk.breakType</c>. Members rely on the default
/// numbering, which starts at 0 and increases by one.
/// </summary>
public enum ChunkBreaktype
{
    CHUNK_NO_BREAK, // 0
    CHUNK_EOW,      // 1
    CHUNK_EOS,      // 2
    CHUNK_EOP,      // 3
    CHUNK_EOC       // 4
}
/// <summary>
/// Kind of content carried by a chunk, reported in <c>StatChunk.flags</c>.
/// The values are distinct bits, so the enum is marked as a flags enum and
/// should be tested with a bit mask.
/// </summary>
[Flags]
public enum ChunkState
{
    CHUNK_TEXT = 0x1,
    CHUNK_VALUE = 0x2,
    CHUNK_FILTER_OWNED_VALUE = 0x4
}
/// <summary>
/// Flags returned by <c>IFilter.Init</c> through its <c>pdwFlags</c> parameter.
/// </summary>
[Flags]
public enum FilterFlags
{
    IFILTER_FLAGS_OLE_PROPERTIES = 0x1
}
/// <summary>
/// Initialization flags passed to <c>IFilter.Init</c> as <c>grfFlags</c>.
/// Listed in numeric order; the last three flags were missing and are added
/// so the enum covers the full native IFILTER_INIT set.
/// </summary>
[Flags]
public enum FilterInit
{
    IFILTER_INIT_CANON_PARAGRAPHS = 1,
    IFILTER_INIT_HARD_LINE_BREAKS = 2,
    IFILTER_INIT_CANON_HYPHENS = 4,
    IFILTER_INIT_CANON_SPACES = 8,
    IFILTER_INIT_APPLY_INDEX_ATTRIBUTES = 16,
    IFILTER_INIT_APPLY_OTHER_ATTRIBUTES = 32,
    IFILTER_INIT_INDEXING_ONLY = 64,
    IFILTER_INIT_SEARCH_LINKS = 128,
    IFILTER_INIT_APPLY_CRAWL_ATTRIBUTES = 256,
    IFILTER_INIT_FILTER_OWNED_VALUE_OK = 512,
    IFILTER_INIT_FILTER_AGGRESSIVE_BREAK = 1024,
    IFILTER_INIT_DISABLE_EMBEDDED = 2048,
    IFILTER_INIT_EMIT_FORMATTING = 4096
}
// Region descriptor passed by reference to IFilter.BindRegion. Three 4-byte fields;
// the field order defines the native layout.
public struct FilterRegion
{
// Identifier of the chunk the region refers to.
public int idChunk;
// NOTE(review): presumably the character offset of the region within the chunk — confirm.
public int cwcStart;
// NOTE(review): presumably the region length in characters — confirm.
public int cwcExtent;
}
/// <summary>
/// Status codes returned by the IFilter methods and by <c>LoadIFilter</c>.
/// Values with the high bit set are failures; FILTER_S_* and FILTER_W_* are success codes.
/// </summary>
public enum FilterReturnCodes : uint
{
    S_OK = 0,
    E_ACCESSDENIED = 0x80070005,
    E_HANDLE = 0x80070006,
    E_INVALIDARG = 0x80070057,
    E_OUTOFMEMORY = 0x8007000E,
    E_NOTIMPL = 0x80004001,
    // Win32 value of E_FAIL; 0x80000008 is the legacy 16-bit definition and is never returned here.
    E_FAIL = 0x80004005,
    FILTER_E_PASSWORD = 0x8004170B,
    FILTER_E_UNKNOWNFORMAT = 0x8004170C,
    FILTER_E_NO_TEXT = 0x80041705,
    FILTER_E_NO_VALUES = 0x80041706,
    FILTER_E_END_OF_CHUNKS = 0x80041700,
    FILTER_E_NO_MORE_TEXT = 0x80041701,
    FILTER_E_NO_MORE_VALUES = 0x80041702,
    FILTER_E_ACCESS = 0x80041703,
    FILTER_W_MONIKER_CLIPPED = 0x00041704,
    FILTER_E_EMBEDDING_UNAVAILABLE = 0x80041707,
    FILTER_E_LINK_UNAVAILABLE = 0x80041708,
    FILTER_S_LAST_TEXT = 0x00041709,
    FILTER_S_LAST_VALUES = 0x0004170A
}
// Property specification embedded in StatChunk.attribute: a property-set GUID followed
// by a PropSpec. Field order defines the native layout.
public struct FullPropSpec
{
// GUID of the property set.
public Guid guidPropSet;
// Property within the set; see PropSpec for how its members are interpreted.
public PropSpec psProperty;
}
/// <summary>
/// Managed view of the native PROPSPEC: a 32-bit kind followed by a union of a
/// 32-bit property id and a string pointer.
/// </summary>
/// <remarks>
/// The union is pointer-aligned natively, so it starts at offset 4 in a 32-bit process
/// and at offset 8 in a 64-bit process. A fixed FieldOffset(4) is therefore wrong on
/// 64-bit; sequential layout with a pointer-sized field is correct for both.
/// </remarks>
[StructLayout(LayoutKind.Sequential)]
public struct PropSpec
{
    public int ulKind;

    // Storage for the whole union; also the string-pointer view of it.
    public IntPtr lpwstr;

    // Property-id view of the union: its low 32 bits (the target platforms are little-endian).
    public int propid
    {
        get { return unchecked((int)lpwstr.ToInt64()); }
        set { lpwstr = new IntPtr(value); }
    }
}
回答2:
I have worked in the past on providing an iFilter intended to provide any search/indexing tool access to text contents inside an AutoCad dwg file. You can read some of my adventure here: http://blogs.msdn.com/b/ifilter/archive/2006/12/25/chronicles-of-an-ifilter-development-inception-to-deployment.aspx
The code you are referring to is old, but still valid. However, there are now more interfaces in use besides GetTextFromFile. You will need to use the stream reader; read up on IPersistStream in the link I mentioned above. If I understand what you want to do, you'll need to open the file as a stream from the database and present this stream to the search/indexer or the iFilter of your choice.
Good luck, Marco
回答3:
I was hoping to do the same thing, but I ended up adding another column to the database table for the TextContent. I saved the BinaryContent to a temporary file, used the CodeProject library EPocalipse.IFilter dll to find the text, and added that to the TextContent column.
回答4:
Building on Marek's example, here's my take, which uses an implementation of the IStream interface instead of allocating memory through Marshal.AllocHGlobal to create a COM stream.
It works with the Adobe PDF iFilter 64 11.0.01 and a ton of formats such as .doc, .docx, .html, .odt, and .rtf; the list goes on.
Complete example:
using System;
using System.IO;
using System.Runtime.InteropServices;
using System.Text;
namespace TextExtraction
{
class Program
{
/// <summary>
/// Demo entry point: opens a sample document, extracts its text through the
/// matching IFilter and prints it, then waits for a key press.
/// </summary>
static void Main(string[] args)
{
    var source = new FileInfo(@"C:\Path\To\Some.doc");
    using (var input = source.OpenRead())
    {
        // Load returns null when no usable filter is available for the extension.
        if (Load(input, source.Extension) is IFilter loaded)
        {
            Console.WriteLine(GetText(loaded));
        }
    }

    Console.WriteLine("Press your favorite key to exit");
    Console.ReadKey();
}
/// <summary>
/// Loads the IFilter registered for <paramref name="extension"/>, feeds it the content
/// of <paramref name="stream"/> through IPersistStream and initializes it.
/// </summary>
/// <param name="stream">Readable, seekable stream holding the document content.</param>
/// <param name="extension">File extension including the dot, e.g. ".doc".</param>
/// <returns>
/// An initialized filter, or null when no filter is registered, the filter does not
/// implement IPersistStream, or initialization does not return S_OK.
/// </returns>
private static IFilter Load(Stream stream, string extension)
{
    IFilter filter = null;
    if (NativeMethods.LoadIFilter(extension, null, ref filter) != HRESULT.S_OK || filter == null)
    {
        return null;
    }

    var handedOut = false;
    try
    {
        if (filter is IPersistStream persistStream)
        {
            persistStream.Load(new ManagedStream(stream));
            if (filter.Init(IFILTER_INIT.IFILTER_INIT_APPLY_INDEX_ATTRIBUTES, 0, IntPtr.Zero, out _) == IFilterReturnCodes.S_OK)
            {
                handedOut = true;
                return filter;
            }
        }
        return null;
    }
    finally
    {
        // Release the COM wrapper on every path that does not hand the filter to the caller.
        if (!handedOut)
        {
            Marshal.ReleaseComObject(filter);
        }
    }
}
/// <summary>
/// Collects the text of every chunk the filter returns, stopping at the first
/// GetChunk call that does not return S_OK.
/// </summary>
/// <param name="filter">Initialized filter to read from.</param>
/// <returns>The accumulated text.</returns>
private static string GetText(IFilter filter)
{
    var collected = new StringBuilder();
    for (;;)
    {
        var status = filter.GetChunk(out var current);
        if (status != IFilterReturnCodes.S_OK)
        {
            break;
        }
        ReadChunk(filter, current, collected);
    }
    return collected.ToString();
}
/// <summary>
/// Appends the text of the current chunk to <paramref name="text"/>, preceded by a single
/// newline when the chunk starts a new paragraph (CHUNK_EOP).
/// </summary>
/// <param name="filter">Filter positioned on the chunk to read.</param>
/// <param name="chunk">Descriptor of the current chunk; only its break type is used.</param>
/// <param name="text">Builder that receives the chunk text.</param>
private static void ReadChunk(IFilter filter, STAT_CHUNK chunk, StringBuilder text)
{
    const uint BufferChars = 4096U;
    // One buffer is enough: only the reported number of characters is copied out per call.
    var buffer = new char[BufferChars];
    // The break belongs to the chunk, so emit it once, not once per GetText call.
    var breakPending = chunk.breakType == CHUNK_BREAKTYPE.CHUNK_EOP;

    IFilterReturnCodes textResult;
    do
    {
        var count = BufferChars;
        textResult = filter.GetText(ref count, buffer);
        var gotText = textResult == IFilterReturnCodes.S_OK || textResult == IFilterReturnCodes.FILTER_S_LAST_TEXT;
        if (gotText && count > 0)
        {
            if (breakPending)
            {
                text.Append('\n');
                breakPending = false;
            }
            // Clamp defensively so a bad count can never index past the buffer.
            text.Append(buffer, 0, (int) Math.Min(count, BufferChars));
        }
    } while (textResult == IFilterReturnCodes.S_OK);
}
/// <summary>
/// Managed declaration of the IFilter COM interface (IUnknown-based). All methods are
/// [PreserveSig] and return their status as <c>IFilterReturnCodes</c> instead of throwing.
/// The declaration order is the vtable order and must not be changed.
/// </summary>
[Guid("89BCB740-6119-101A-BCB7-00DD010655AF")]
[InterfaceType(ComInterfaceType.InterfaceIsIUnknown)]
public interface IFilter
{
/// <summary>Prepares the filter; <c>pdwFlags</c> receives flags reported by the filter.</summary>
[PreserveSig]
IFilterReturnCodes Init(IFILTER_INIT grfFlags, int cAttributes, IntPtr aAttributes,
out IFILTER_FLAGS pdwFlags);
/// <summary>Advances to the next chunk and returns its descriptor.</summary>
[PreserveSig]
IFilterReturnCodes GetChunk(out STAT_CHUNK pStat);
/// <summary>
/// Retrieves text of the current chunk into <c>awcBuffer</c>. <c>pcwcBuffer</c> carries the
/// buffer capacity in and is read back by the caller as the number of characters written.
/// </summary>
[PreserveSig]
IFilterReturnCodes GetText(ref uint pcwcBuffer, [Out, MarshalAs(UnmanagedType.LPArray)]
char[] awcBuffer);
/// <summary>Declared to keep the vtable complete; not called by this sample.</summary>
[PreserveSig]
IFilterReturnCodes GetValue(ref IntPtr propVal);
/// <summary>Declared to keep the vtable complete; not called by this sample.</summary>
[PreserveSig]
IFilterReturnCodes BindRegion(ref FILTERREGION origPos, ref Guid riid, ref object ppunk);
}
/// <summary>Managed declaration of the IPersist COM interface; base of IPersistStream.</summary>
[Guid("0000010C-0000-0000-C000-000000000046")]
[InterfaceType(ComInterfaceType.InterfaceIsIUnknown)]
public interface IPersist
{
/// <summary>
/// Returns the class identifier of the object. Not [PreserveSig], so a failure surfaces
/// as an exception.
/// </summary>
void GetClassID(out Guid pClassID);
}
/// <summary>
/// Managed declaration of the IPersistStream COM interface, used to hand document content
/// to a filter as a stream. Method order is the vtable order and must not change.
/// </summary>
[Guid("00000109-0000-0000-C000-000000000046")]
[InterfaceType(ComInterfaceType.InterfaceIsIUnknown)]
public interface IPersistStream : IPersist
{
/// <summary>
/// Re-declared with <c>new</c> so the inherited IPersist slot is present in this
/// interface's vtable; COM interop does not add base-interface methods automatically.
/// </summary>
new void GetClassID(out Guid pClassID);
/// <summary>Returns the raw status as an int rather than throwing.</summary>
[PreserveSig]
int IsDirty();
/// <summary>Loads the object state from <c>pStm</c>.</summary>
void Load([In] IStream pStm);
/// <summary>Writes the object state to <c>pStm</c>; <c>fClearDirty</c> is marshalled as a 4-byte BOOL.</summary>
void Save([In] IStream pStm, [In, MarshalAs(UnmanagedType.Bool)] bool fClearDirty);
/// <summary>Reports a size in <c>pcbSize</c>. NOTE(review): presumably the maximum size needed by Save — confirm.</summary>
void GetSizeMax(out long pcbSize);
}
/// <summary>
/// Managed declaration of the IStream COM interface, implemented by <c>ManagedStream</c>
/// so that a managed stream can be passed to IPersistStream.Load.
/// </summary>
/// <remarks>
/// Every method is [PreserveSig] and returns its HRESULT directly. The declaration order
/// is the vtable order (Read and Write first, then the IStream methods) and must not change.
/// </remarks>
[Guid("0000000C-0000-0000-C000-000000000046")]
[InterfaceType(ComInterfaceType.InterfaceIsIUnknown)]
public interface IStream
{
    /// <summary>Reads up to <c>cb</c> bytes into <c>pv</c>; <c>pcbRead</c> may be null (IntPtr.Zero).</summary>
    [PreserveSig]
    HRESULT Read([MarshalAs(UnmanagedType.LPArray, SizeParamIndex = 1)] [Out]
        byte[] pv, int cb, IntPtr pcbRead);

    /// <summary>Writes <c>cb</c> bytes from <c>pv</c>; <c>pcbWritten</c> may be null (IntPtr.Zero).</summary>
    [PreserveSig]
    HRESULT Write([MarshalAs(UnmanagedType.LPArray, SizeParamIndex = 1)]
        byte[] pv, int cb, IntPtr pcbWritten);

    /// <summary>Moves the seek pointer; <c>plibNewPosition</c> may be null (IntPtr.Zero).</summary>
    [PreserveSig]
    HRESULT Seek(long dlibMove, int dwOrigin, IntPtr plibNewPosition);

    [PreserveSig]
    HRESULT SetSize(long libNewSize);

    // [PreserveSig] was missing here only; without it the runtime exposes a different
    // native signature (HRESULT as an extra out parameter) than the one COM callers use.
    [PreserveSig]
    HRESULT CopyTo(IStream pstm, long cb, IntPtr pcbRead, IntPtr pcbWritten);

    [PreserveSig]
    HRESULT Commit(int grfCommitFlags);

    [PreserveSig]
    HRESULT Revert();

    [PreserveSig]
    HRESULT LockRegion(long libOffset, long cb, int dwLockType);

    [PreserveSig]
    HRESULT UnlockRegion(long libOffset, long cb, int dwLockType);

    /// <summary>Fills <c>pstatstg</c> with information about the stream.</summary>
    [PreserveSig]
    HRESULT Stat(out STATSTG pstatstg, int grfStatFlag);

    [PreserveSig]
    HRESULT Clone(out IStream ppstm);
}
/// <summary>
/// Exposes a managed <see cref="Stream"/> to COM as an <c>IStream</c>. Only Read, Seek and
/// Stat are implemented; every other method returns E_NOTIMPL.
/// </summary>
/// <remarks>The wrapped stream is not disposed by this class; the caller keeps ownership.</remarks>
public class ManagedStream : IStream
{
    private readonly Stream _stream;

    /// <param name="stream">Stream to expose; must be readable and seekable for Read/Seek/Stat to work.</param>
    /// <exception cref="ArgumentNullException"><paramref name="stream"/> is null.</exception>
    public ManagedStream(Stream stream)
    {
        _stream = stream ?? throw new ArgumentNullException(nameof(stream));
    }

    public HRESULT Clone(out IStream ppstm)
    {
        ppstm = null;
        return HRESULT.E_NOTIMPL;
    }

    public HRESULT Commit(int grfCommitFlags)
    {
        return HRESULT.E_NOTIMPL;
    }

    public HRESULT CopyTo(IStream pstm, long cb, IntPtr pcbRead, IntPtr pcbWritten)
    {
        return HRESULT.E_NOTIMPL;
    }

    public HRESULT LockRegion(long libOffset, long cb, int dwLockType)
    {
        return HRESULT.E_NOTIMPL;
    }

    /// <summary>
    /// Reads up to <paramref name="cb"/> bytes into <paramref name="pv"/> and reports the
    /// number of bytes read through <paramref name="pcbRead"/> when it is not null.
    /// </summary>
    public HRESULT Read(byte[] pv, int cb, IntPtr pcbRead)
    {
        // Stream.Read may return fewer bytes than requested before the end of the stream,
        // so keep reading until the request is satisfied or the stream is exhausted.
        var total = 0;
        while (total < cb)
        {
            var read = _stream.Read(pv, total, cb - total);
            if (read <= 0)
            {
                break;
            }
            total += read;
        }

        if (pcbRead != IntPtr.Zero)
        {
            Marshal.WriteInt32(pcbRead, total);
        }
        return HRESULT.S_OK;
    }

    public HRESULT Revert()
    {
        return HRESULT.E_NOTIMPL;
    }

    /// <summary>
    /// Moves the position of the wrapped stream. Returns E_FAIL for an unknown origin.
    /// </summary>
    public HRESULT Seek(long dlibMove, int dwOrigin, IntPtr plibNewPosition)
    {
        SeekOrigin seekOrigin;
        switch (dwOrigin)
        {
            case (int) STREAM_SEEK.STREAM_SEEK_SET:
                seekOrigin = SeekOrigin.Begin;
                break;
            case (int) STREAM_SEEK.STREAM_SEEK_CUR:
                seekOrigin = SeekOrigin.Current;
                break;
            case (int) STREAM_SEEK.STREAM_SEEK_END:
                seekOrigin = SeekOrigin.End;
                break;
            default:
                return HRESULT.E_FAIL;
        }

        var position = _stream.Seek(dlibMove, seekOrigin);
        if (plibNewPosition != IntPtr.Zero)
        {
            Marshal.WriteInt64(plibNewPosition, position);
        }
        return HRESULT.S_OK;
    }

    public HRESULT SetSize(long libNewSize)
    {
        return HRESULT.E_NOTIMPL;
    }

    /// <summary>
    /// Reports the stream type, length and access mode. Returns E_ACCESSDENIED when the
    /// wrapped stream can be neither read nor written.
    /// </summary>
    public HRESULT Stat(out STATSTG pstatstg, int grfStatFlag)
    {
        pstatstg = new STATSTG
        {
            type = (int) STGTY.STGTY_STREAM,
            cbSize = _stream.Length,
            grfMode = (int) STGM.STGM_READ
        };

        if (!_stream.CanRead && !_stream.CanWrite)
        {
            return HRESULT.E_ACCESSDENIED;
        }

        // STGM_READ is 0, so only the write-capable cases change the mode.
        if (_stream.CanRead && _stream.CanWrite)
        {
            pstatstg.grfMode |= (int) STGM.STGM_READWRITE;
        }
        else if (_stream.CanWrite)
        {
            pstatstg.grfMode |= (int) STGM.STGM_WRITE;
        }
        return HRESULT.S_OK;
    }

    public HRESULT UnlockRegion(long libOffset, long cb, int dwLockType)
    {
        return HRESULT.E_NOTIMPL;
    }

    public HRESULT Write(byte[] pv, int cb, IntPtr pcbWritten)
    {
        return HRESULT.E_NOTIMPL;
    }
}
/// <summary>P/Invoke declarations used by this sample.</summary>
public class NativeMethods
{
/// <summary>
/// Imported from query.dll. Receives a path or file extension in <c>pwcsPath</c> and hands
/// the filter back through <c>ppIUnk</c>; the status is returned as an HRESULT value.
/// </summary>
[DllImport("query.dll", SetLastError = true, CharSet = CharSet.Unicode)]
public static extern HRESULT LoadIFilter(string pwcsPath, [MarshalAs(UnmanagedType.IUnknown)] object pUnkOuter, ref IFilter ppIUnk);
}
/// <summary>
/// Two 32-bit halves of a timestamp, embedded three times in STATSTG. Field order defines
/// the native layout (low part first).
/// </summary>
public struct FILETIME
{
public uint DateTimeLow;
public uint DateTimeHigh;
}
/// <summary>
/// Region descriptor passed by reference to <c>IFilter.BindRegion</c>.
/// </summary>
/// <remarks>
/// The native fields are ULONG, which is 32 bits wide on Windows, so they are declared
/// as <c>uint</c>; 64-bit fields would double the structure size and shift every field.
/// </remarks>
[StructLayout(LayoutKind.Sequential)]
public struct FILTERREGION
{
    public uint idChunk;
    public uint cwcStart;
    public uint cwcExtent;
}
/// <summary>
/// Property specification embedded in <c>STAT_CHUNK.attribute</c>: a property-set GUID
/// followed by a PROPSPEC. Sequential layout; field order defines the native layout.
/// </summary>
[StructLayout(LayoutKind.Sequential)]
public struct FULLPROPSPEC
{
public Guid guidPropSet;
public PROPSPEC psProperty;
}
/// <summary>
/// Managed view of the native PROPSPEC: a 32-bit kind followed by a union of a
/// 32-bit property id and a string pointer.
/// </summary>
/// <remarks>
/// The union is pointer-aligned natively: offset 4 in a 32-bit process, offset 8 in a
/// 64-bit process. Fixed FieldOffset(4) members are therefore wrong on 64-bit. The layout
/// is expressed with a 32-bit field plus one pointer-sized field, and the original
/// members are exposed as views over that storage.
/// </remarks>
[StructLayout(LayoutKind.Sequential)]
public struct PROPSPEC
{
    // ULONG ulKind — stored as uint so the size is 4 bytes whatever the enum's underlying type.
    private uint _kind;

    // union { PROPID propid; LPOLESTR lpwstr; }
    private IntPtr _union;

    public PROPSPECKIND ulKind
    {
        get => (PROPSPECKIND) _kind;
        set => _kind = (uint) value;
    }

    // Low 32 bits of the union (the target platforms are little-endian).
    public uint propid
    {
        get => unchecked((uint) _union.ToInt64());
        set => _union = new IntPtr(unchecked((int) value));
    }

    public IntPtr lpwstr
    {
        get => _union;
        set => _union = value;
    }
}
/// <summary>
/// Chunk descriptor filled in by <c>IFilter.GetChunk</c>. Field order and sizes define the
/// native layout, so fields must not be reordered or resized.
/// </summary>
public struct STAT_CHUNK
{
// Identifier of the chunk.
public int idChunk;
// Break type, marshalled as a 4-byte unsigned value; ReadChunk compares it with CHUNK_EOP.
[MarshalAs(UnmanagedType.U4)]
public CHUNK_BREAKTYPE breakType;
// Chunk kind flags, marshalled as a 4-byte unsigned value.
[MarshalAs(UnmanagedType.U4)]
public CHUNKSTATE flags;
// NOTE(review): presumably the locale identifier (LCID) of the chunk text — confirm.
public int locale;
// Property specification attached to the chunk.
public FULLPROPSPEC attribute;
// NOTE(review): the three fields below presumably locate the chunk within its source chunk — confirm.
public int idChunkSource;
public int cwcStartSource;
public int cwcLenSource;
}
/// <summary>
/// Stream statistics returned by <c>IStream.Stat</c>. Field order defines the native layout.
/// </summary>
public struct STATSTG
{
    // The native member is an LPOLESTR (UTF-16). LPTStr would follow the struct's CharSet,
    // which defaults to ANSI, so the wide-string type is stated explicitly.
    [MarshalAs(UnmanagedType.LPWStr)]
    public string pwcsName;
    public int type;
    public long cbSize;
    public FILETIME mtime;
    public FILETIME ctime;
    public FILETIME atime;
    public int grfMode;
    public int grfLocksSupported;
    public Guid clsid;
    public int grfStateBits;
    public int reserved;
}
/// <summary>
/// Status codes returned by the IFilter methods. Values with the high bit set are
/// failures; FILTER_S_* and FILTER_W_* are success codes.
/// </summary>
/// <remarks>These are discrete codes, not a bit mask, so the enum is not marked [Flags].</remarks>
public enum IFilterReturnCodes : uint
{
    S_OK = 0,
    E_ACCESSDENIED = 0x80070005,
    E_HANDLE = 0x80070006,
    E_INVALIDARG = 0x80070057,
    E_OUTOFMEMORY = 0x8007000E,
    E_NOTIMPL = 0x80004001,
    // Win32 value of E_FAIL; 0x80000008 is the legacy 16-bit definition.
    E_FAIL = 0x80004005,
    FILTER_E_PASSWORD = 0x8004170B,
    FILTER_E_UNKNOWNFORMAT = 0x8004170C,
    FILTER_E_NO_TEXT = 0x80041705,
    FILTER_E_NO_VALUES = 0x80041706,
    FILTER_E_END_OF_CHUNKS = 0x80041700,
    FILTER_E_NO_MORE_TEXT = 0x80041701,
    FILTER_E_NO_MORE_VALUES = 0x80041702,
    FILTER_E_ACCESS = 0x80041703,
    FILTER_W_MONIKER_CLIPPED = 0x00041704,
    FILTER_E_EMBEDDING_UNAVAILABLE = 0x80041707,
    FILTER_E_LINK_UNAVAILABLE = 0x80041708,
    FILTER_S_LAST_TEXT = 0x00041709,
    FILTER_S_LAST_VALUES = 0x0004170A
}
/// <summary>
/// Break kind reported in <c>STAT_CHUNK.breakType</c>.
/// </summary>
/// <remarks>
/// The values are a sequence (0..4), not bits — CHUNK_EOP (3) is not CHUNK_EOW | CHUNK_EOS —
/// so the enum is not marked [Flags].
/// </remarks>
public enum CHUNK_BREAKTYPE : uint
{
    CHUNK_NO_BREAK = 0,
    CHUNK_EOW = 1,
    CHUNK_EOS = 2,
    CHUNK_EOP = 3,
    CHUNK_EOC = 4
}
/// <summary>
/// Kind of content carried by a chunk, reported in <c>STAT_CHUNK.flags</c>.
/// Each member occupies its own bit.
/// </summary>
[Flags]
public enum CHUNKSTATE : uint
{
    CHUNK_TEXT = 1 << 0,
    CHUNK_VALUE = 1 << 1,
    CHUNK_FILTER_OWNED_VALUE = 1 << 2
}
/// <summary>
/// Common COM status codes returned by the stream implementation and by <c>LoadIFilter</c>.
/// </summary>
/// <remarks>
/// These are discrete codes, not a bit mask, so the enum is not marked [Flags].
/// S_FALSE is included because it is the standard "succeeded, but negative/partial" code.
/// </remarks>
public enum HRESULT : uint
{
    S_OK = 0x00000000,
    S_FALSE = 0x00000001,
    E_NOTIMPL = 0x80004001,
    E_NOINTERFACE = 0x80004002,
    E_POINTER = 0x80004003,
    E_ABORT = 0x80004004,
    E_FAIL = 0x80004005,
    E_UNEXPECTED = 0x8000FFFF,
    E_ACCESSDENIED = 0x80070005,
    E_HANDLE = 0x80070006,
    E_OUTOFMEMORY = 0x8007000E,
    E_INVALIDARG = 0x80070057
}
/// <summary>
/// Flags returned by <c>IFilter.Init</c> through its <c>pdwFlags</c> parameter.
/// </summary>
[Flags]
public enum IFILTER_FLAGS
{
    IFILTER_FLAGS_OLE_PROPERTIES = 0x1
}
/// <summary>
/// Initialization flags passed to <c>IFilter.Init</c> as <c>grfFlags</c>, listed in numeric order.
/// </summary>
[Flags]
public enum IFILTER_INIT
{
    IFILTER_INIT_CANON_PARAGRAPHS = 1,
    IFILTER_INIT_HARD_LINE_BREAKS = 2,
    IFILTER_INIT_CANON_HYPHENS = 4,
    IFILTER_INIT_CANON_SPACES = 8,
    IFILTER_INIT_APPLY_INDEX_ATTRIBUTES = 16,
    IFILTER_INIT_APPLY_OTHER_ATTRIBUTES = 32,
    IFILTER_INIT_INDEXING_ONLY = 64,
    IFILTER_INIT_SEARCH_LINKS = 128,
    IFILTER_INIT_APPLY_CRAWL_ATTRIBUTES = 256,
    IFILTER_INIT_FILTER_OWNED_VALUE_OK = 512,
    IFILTER_INIT_FILTER_AGGRESSIVE_BREAK = 1024,
    // Name as spelled in the native header.
    IFILTER_INIT_DISABLE_EMBEDDED = 2048,
    // Original (misspelled) name, kept so existing callers still compile.
    IFILTER_INIT_DISABLED_EMBEDDED = IFILTER_INIT_DISABLE_EMBEDDED,
    IFILTER_INIT_EMIT_FORMATTING = 4096
}
/// <summary>
/// Discriminator for the PROPSPEC union: whether the property is identified by a
/// string or by a numeric property id.
/// </summary>
/// <remarks>
/// The native field is a ULONG, which is 32 bits wide on Windows, so the underlying type
/// is <c>uint</c>. The two values are alternatives, not bits, so the enum is not [Flags].
/// </remarks>
public enum PROPSPECKIND : uint
{
    PRSPEC_LPWSTR = 0,
    PRSPEC_PROPID = 1
}
/// <summary>
/// Storage access-mode constants; <c>ManagedStream.Stat</c> uses the access values to
/// fill <c>STATSTG.grfMode</c>. Members are grouped by purpose.
/// </summary>
[Flags]
public enum STGM : ulong
{
    // Access
    STGM_READ = 0x00000000,
    STGM_WRITE = 0x00000001,
    STGM_READWRITE = 0x00000002,

    // Sharing
    STGM_SHARE_EXCLUSIVE = 0x00000010,
    STGM_SHARE_DENY_WRITE = 0x00000020,
    STGM_SHARE_DENY_READ = 0x00000030,
    STGM_SHARE_DENY_NONE = 0x00000040,
    STGM_PRIORITY = 0x00040000,

    // Creation
    STGM_FAILIFTHERE = 0x00000000,
    STGM_CREATE = 0x00001000,
    STGM_CONVERT = 0x00020000,

    // Transactioning
    STGM_DIRECT = 0x00000000,
    STGM_TRANSACTED = 0x00010000,

    // Performance and mode modifiers
    STGM_NOSCRATCH = 0x00100000,
    STGM_NOSNAPSHOT = 0x00200000,
    STGM_DIRECT_SWMR = 0x00400000,
    STGM_DELETEONRELEASE = 0x04000000,
    STGM_SIMPLE = 0x08000000
}
/// <summary>
/// Storage element type; <c>ManagedStream.Stat</c> reports STGTY_STREAM in <c>STATSTG.type</c>.
/// </summary>
/// <remarks>
/// The values are a sequence (1..4), not bits — STGTY_LOCKBYTES (3) is not
/// STGTY_STORAGE | STGTY_STREAM — so the enum is not marked [Flags].
/// </remarks>
public enum STGTY : int
{
    STGTY_STORAGE = 1,
    STGTY_STREAM = 2,
    STGTY_LOCKBYTES = 3,
    STGTY_PROPERTY = 4
}
/// <summary>
/// Seek origin passed to <c>IStream.Seek</c> as <c>dwOrigin</c>; <c>ManagedStream.Seek</c>
/// maps these to <c>SeekOrigin</c>.
/// </summary>
/// <remarks>The values are mutually exclusive alternatives, so the enum is not marked [Flags].</remarks>
public enum STREAM_SEEK : int
{
    STREAM_SEEK_SET = 0,
    STREAM_SEEK_CUR = 1,
    STREAM_SEEK_END = 2
}
}
}
来源:https://stackoverflow.com/questions/7313828/using-ifilter-in-c-sharp-and-retrieving-file-from-database-rather-than-file-syst