Delphi IFilter implementation

前端 未结 2 1725
我在风中等你
我在风中等你 2021-01-06 09:34

I need to implement an IFilter in Delphi 2010 that can search through Office 2007 docx files and return the text found in the document.

The ifilter also needs to use

相关标签:
2条回答
  • 2021-01-06 10:14

    You don't want to implement an IFilter to parse an Office 2007 docx. You want to use Microsoft's already written IFilter objects, so that you can learn the contents of a docx file.

    Then you use standard IFilter mechanisms to parse the file contents:

    // Loads the system-registered IFilter for `filename`, walks every chunk the
    // filter produces and logs the text of each text chunk.
    //
    // Parameters:
    //   filename - path of the document to filter (e.g. a .docx file).
    //
    // Errors:
    //   Whatever LoadIFilter raises propagates to the caller, and OleCheck
    //   raises if filter.Init fails. Failures while enumerating chunks are
    //   logged and not raised.
    //
    // Value chunks are detected but their contents are not read.
    procedure TForm1.ProcessFile(filename: string);
    const
        // Number of WideChars requested from GetText per call.
        TEXT_BUFFER_CHARS = 2048;
    var
        filter: IFilter;
        hr: HRESULT;
        chunk: STAT_CHUNK;
        flags: ULONG;
        c: Cardinal;
        buffer: WideString;
        firstRead: Boolean;
    begin
        Log('Processing "'+filename+'"');
    
        Log('Calling LoadIFilter');
        filter := LoadIFilter(filename);
        if filter = nil then
        begin
            Log('filter is null; leaving');
            Exit;
        end;
        try
            Log('Calling filter.Init(IFILTER_INIT_INDEXING_ONLY)');
            hr := filter.Init(IFILTER_INIT_INDEXING_ONLY, 0, nil, flags);
            OleCheck(hr);
    
            Log('Init returned sucessfully, looking for chunks...');
            while True do
            begin
                // The chunk descriptor lives on the stack; clear it so a filter
                // that fails part-way never leaves stale data from the last pass.
                FillChar(chunk, SizeOf(chunk), 0);
                hr := filter.GetChunk(@chunk);
    
                if hr = FILTER_E_END_OF_CHUNKS then
                begin
                    Log('No more chunks: '+IntToHex(hr, 8)+' ('+GetChunkHresultToStr(hr)+')');
                    Break;
                end;
    
                // These two codes only mean "this particular chunk is an embedded
                // or linked object that has no filter"; later chunks are still valid.
                if (hr = FILTER_E_EMBEDDING_UNAVAILABLE) or (hr = FILTER_E_LINK_UNAVAILABLE) then
                begin
                    Log('Skipping unavailable embedded/linked chunk: '+IntToHex(hr, 8)+' ('+GetChunkHresultToStr(hr)+')');
                    Continue;
                end;
    
                // Anything else that failed (password, access, ...) is a real error,
                // not a normal end of enumeration.
                if Failed(hr) then
                begin
                    Log('GetChunk failed: '+IntToHex(hr, 8)+' ('+GetChunkHresultToStr(hr)+')');
                    Break;
                end;
    
                Log('== Got chunk. ChunkType='+IntToStr(chunk.flags)+' (1=text, 2=value) ==');
    
                if (chunk.flags and CHUNK_TEXT) = CHUNK_TEXT then
                begin
                    // A chunk may hold more text than one buffer; keep calling
                    // GetText until it fails or reports that this was the last piece.
                    firstRead := True;
                    repeat
                        c := TEXT_BUFFER_CHARS;
                        SetLength(buffer, c);
                        hr := filter.GetText(c, PWideChar(buffer));
                        if Failed(hr) then
                        begin
                            // Failure on a later read is the normal
                            // "no more text" exit; only the first one is worth reporting.
                            if firstRead then
                            begin
                                Log('Could not get text from chunk: '+IntToHex(hr, 8)+' ('+GetChunkHResultToStr(hr)+')');
                                Log('   It might be a "Value" chunk, meaning i should call filter.GetValue rather than filter.GetText. But i''m too lazy');
                            end;
                            Break;
                        end;
    
                        if firstRead then
                            Log('=== Got text ===')
                        else
                            Log('==== Really long chunk, here''s the next 2048 characters ====');
    
                        // On return `c` holds the number of characters actually written.
                        SetLength(buffer, c);
                        Log(buffer);
                        firstRead := False;
                    // FILTER_S_LAST_TEXT is an optional hint that the next call would
                    // return FILTER_E_NO_MORE_TEXT, so the extra round-trip can be skipped.
                    until hr = FILTER_S_LAST_TEXT;
                end
                else if (chunk.flags and CHUNK_VALUE) = CHUNK_VALUE then
                begin
                    Log('This is a "VALUE" chunk. i''m not going to read anything out of it cause it''s too hard :(');
                end
                else
                    Log('Unknown chunk type');
            end; //end while true getting chunks
        finally
            // Release the filter (and the file it holds open) as soon as we are done.
            filter := nil;
        end;
    end;
    

    Where Windows already provides the code that loads an IFilter for a specified filename:

    // Asks ntQuery.LoadIFilter for the filter object that handles `filename`
    // and returns it as an IFilter.
    //
    // Raises (via OleCheck) when LoadIFilter returns a failure HRESULT, and
    // raises from the `as` cast when the returned object does not support IFilter.
    function TForm1.LoadIFilter(const filename: WideString): IFilter;
    var
        loaded: IUnknown;
    begin
        OleCheck(ntQuery.LoadIFilter(PWideChar(filename), nil, loaded));
    
        // `as` performs the QueryInterface for IFilter.
        Result := loaded as IFilter;
    end;
    

    IFilter declarations unit:

    // Delphi translation of the IFilter declarations from the Windows SDK
    // headers filter.h and filterr.h.
    unit Filter;
    
    interface
    
    uses
      Windows, SysUtils, Classes, ActiveX;
    
    // The SDK structures use the compiler's natural alignment, not byte packing.
    // Natural alignment gives the same layout as `packed` on 32-bit targets and
    // the correct (pointer-aligned) layout on 64-bit targets.
    {$ALIGN 8}
    
    // Flags for the grfFlags parameter of IFilter.Init (may be OR-ed together).
    type
        IFILTER_INIT = TOleEnum;
    const
        IFILTER_INIT_CANON_PARAGRAPHS             = 1;
        IFILTER_INIT_HARD_LINE_BREAKS             = 2;
        IFILTER_INIT_CANON_HYPHENS                = 4;
        IFILTER_INIT_CANON_SPACES                 = 8;
        IFILTER_INIT_APPLY_INDEX_ATTRIBUTES   = 16;
        IFILTER_INIT_APPLY_OTHER_ATTRIBUTES   = 32;
        IFILTER_INIT_INDEXING_ONLY                = 64;
        IFILTER_INIT_SEARCH_LINKS                 = 128;
    
    // Flags returned through the pFlags parameter of IFilter.Init.
    type
        IFILTER_FLAGS = TOleEnum;
    const
        IFILTER_FLAGS_OLE_PROPERTIES = 1;
    
    // Kind of content carried by a chunk (STAT_CHUNK.flags).
    type
        CHUNKSTATE = TOleEnum;
    const
        CHUNK_TEXT =    $01;
        CHUNK_VALUE =   $02;
    
    // Kind of break that precedes a chunk (STAT_CHUNK.breakType).
    type
        CHUNK_BREAKTYPE = TOleEnum;
    const
        CHUNK_NO_BREAK =    0;
        CHUNK_EOW =         1;
        CHUNK_EOS =         2;
        CHUNK_EOP =         3;
        CHUNK_EOC =         4;
    
    type
        FILTERREGION = record
            idChunk: ULONG;
            cwcStart: ULONG;
            cwcExtent: ULONG;
        end;
        tagFILTERREGION = FILTERREGION;
    
    
    // Values for PROPSPEC.ulKind: selects which member of the variant part is valid.
    const
        PRSPEC_LPWSTR =     0;
        PRSPEC_PROPID =     1;
    
    type
        PROPID = ULONG;
    
    type
        PROPSPEC = record
            ulKind: ULONG;
            case integer of
            0: (prid: PROPID);
            1: (lpws: PWideChar);
        end;
        tagPROPSPEC = PROPSPEC;
    
    type
        FULLPROPSPEC = record
            guidPropSet: TGUID;
            psProperty: PROPSPEC;
        end;
        tagFULLPROPSPEC =   FULLPROPSPEC;
        PFULLPROPSPEC =         ^FULLPROPSPEC;
    
    type
        STAT_CHUNK = record
            idChunk: ULONG;
            breakType: CHUNK_BREAKTYPE;
            flags: CHUNKSTATE;
            locale: LCID;
            attribute: FULLPROPSPEC;
            idChunkSource: ULONG;
            cwcStartSource: ULONG;
            cwcLenSource: ULONG;
        end;
        tagSTAT_CHUNK =     STAT_CHUNK;
        PSTAT_CHUNK =       ^STAT_CHUNK;
    
    // From filterr.h
    //
    // MessageId: FILTER_E_END_OF_CHUNKS
    //
    // MessageText:
    //
    //  No more chunks of text available in object.
    //
    const
        FILTER_E_END_OF_CHUNKS = HRESULT($80041700);
    
    //
    // MessageId: FILTER_E_NO_MORE_TEXT
    //
    // MessageText:
    //
    //  No more text available in chunk.
    //
    const
        FILTER_E_NO_MORE_TEXT = HRESULT($80041701);
    
    //
    // MessageId: FILTER_E_NO_MORE_VALUES
    //
    // MessageText:
    //
    //  No more property values available in chunk.
    //
    const
        FILTER_E_NO_MORE_VALUES = HRESULT($80041702);
    
    //
    // MessageId: FILTER_E_ACCESS
    //
    // MessageText:
    //
    //  Unable to access object.
    //
    const
        FILTER_E_ACCESS = HRESULT($80041703);
    
    //
    // MessageId: FILTER_W_MONIKER_CLIPPED
    //
    // MessageText:
    //
    //  Moniker doesn't cover entire region.
    //
    // This is a warning, i.e. a SUCCESS code: the severity bit must be clear so
    // that Succeeded() returns True for it.
    //
    const
        FILTER_W_MONIKER_CLIPPED = HRESULT($00041704);
    
    //
    // MessageId: FILTER_E_NO_TEXT
    //
    // MessageText:
    //
    //  No text in current chunk.
    //
    const
        FILTER_E_NO_TEXT = HRESULT($80041705);
    
    //
    // MessageId: FILTER_E_NO_VALUES
    //
    // MessageText:
    //
    //  No values in current chunk.
    //
    const
        FILTER_E_NO_VALUES = HRESULT($80041706);
    
    //
    // MessageId: FILTER_E_EMBEDDING_UNAVAILABLE
    //
    // MessageText:
    //
    //  Unable to bind IFilter for embedded object.
    //
    const
        FILTER_E_EMBEDDING_UNAVAILABLE = HRESULT($80041707);
    
    //
    // MessageId: FILTER_E_LINK_UNAVAILABLE
    //
    // MessageText:
    //
    //  Unable to bind IFilter for linked object.
    //
    const
        FILTER_E_LINK_UNAVAILABLE             =  HRESULT($80041708);
    
    //
    // MessageId: FILTER_S_LAST_TEXT
    //
    // MessageText:
    //
    //  This is the last text in the current chunk.
    //
    const
        FILTER_S_LAST_TEXT = HRESULT($00041709);
    
    //
    // MessageId: FILTER_S_LAST_VALUES
    //
    // MessageText:
    //
    //  This is the last value in the current chunk.
    //
    const
        FILTER_S_LAST_VALUES = HRESULT($0004170A);
    
    //
    // MessageId: FILTER_E_PASSWORD
    //
    // MessageText:
    //
    //  File was not filtered due to password protection.
    //
    const
        FILTER_E_PASSWORD = HRESULT($8004170B);
    
    //
    // MessageId: FILTER_E_UNKNOWNFORMAT
    //
    // MessageText:
    //
    //  The document format is not recognized by the filter.
    //
    const
        FILTER_E_UNKNOWNFORMAT = HRESULT($8004170C);
    
    
    const
        IID_IFilter: TGUID = '{89BCB740-6119-101A-BCB7-00DD010655AF}';
    
    type
        // Method order must match the COM vtable of IFilter exactly.
        IFilter = interface(IUnknown)
            ['{89BCB740-6119-101A-BCB7-00DD010655AF}']
            // Initializes a filtering session. grfFlags is a combination of
            // IFILTER_INIT_* values; aAttributes points to cAttributes entries
            // (or nil); pFlags receives IFILTER_FLAGS_* values.
            function Init(grfFlags: ULONG; cAttributes: ULONG; aAttributes: PFULLPROPSPEC; out pFlags: ULONG): HResult; stdcall;
            // Positions the filter on the next chunk and fills in pStat^.
            function GetChunk(pStat: PSTAT_CHUNK): HResult; stdcall;
            // Reads text from the current chunk. On entry pcwcBuffer is the
            // capacity of awcBuffer in WideChars; on return it is the count written.
            function GetText(var pcwcBuffer: ULONG; awcBuffer: PWideChar): HResult; stdcall;
            // Native signature is GetValue(PROPVARIANT **): the filter hands
            // back a POINTER to a PROPVARIANT that it allocated, so the out
            // parameter must be a PPropVariant, not a PROPVARIANT record.
            function GetValue(out ppPropValue: PPropVariant): HResult; stdcall;
            // riid is a REFIID (pointer to a GUID) in the native signature, so it
            // must be passed as `const`; a by-value TGUID under stdcall would push
            // the 16-byte record itself and corrupt the call stack.
            function BindRegion(origPos: FILTERREGION; const riid: TGUID; out ppUnk): HResult; stdcall;
        end;
    
    implementation
    
    end.
    
    0 讨论(0)
  • 2021-01-06 10:23

    If you search the old Borland/CodeGear newsgroups, you may find references to an IFilter implementation by "Soluciones Vulcano", which points to develop.shorterpath.com (a site that still seems to exist). Beyond that, I have never seen any other IFilter implementation component, and I have not yet managed to look at that one myself.

    0 讨论(0)
提交回复
热议问题