C Library for compressing sequential positive integers

前端 未结 6 1503
难免孤独
难免孤独 2021-02-05 19:39

I have the very common problem of creating an index for an in-disk array of strings. In short, I need to store the position of each string in the in-disk representation. For exa

6条回答
  •  南笙
    南笙 (楼主)
    2021-02-05 20:05

    I did something similar years ago for a full-text search engine. In my case, each indexed word generated a record which consisted of a record number (document id) and a word number (it could just as easily have stored word offsets) which needed to be compressed as much as possible. I used a delta-compression technique which took advantage of the fact that there would be a number of occurrences of the same word within a document, so the record number often did not need to be repeated at all. And the word offset delta would often fit within one or two bytes. Here is the code I used.

    Since it's in C++, the code may not be useful to you as is, but it can be a good starting point for writing compression routines.

    Please excuse the Hungarian notation and the magic numbers strewn within the code. Like I said, I wrote this many years ago :-)

    IndexCompressor.h

    //
    // index compressor class
    //
    
    #pragma once
    
    #include "File.h"
    
    // Size, in bytes, of the staging buffer inside IndexCompressor
    // (m_byBuffer). Add() flushes the buffer to the file when the next
    // encoded record would not fit within this many bytes.
    const int IC_BUFFER_SIZE = 8192;
    
    //
    // index compressor
    //
    class IndexCompressor
    {
    private :
       File        *m_pFile;
       WA_DWORD    m_dwRecNo;
       WA_DWORD    m_dwWordNo;
       WA_DWORD    m_dwRecordCount;
       WA_DWORD    m_dwHitCount;
    
       WA_BYTE     m_byBuffer[IC_BUFFER_SIZE];
       WA_DWORD    m_dwBytes;
    
       bool        m_bDebugDump;
    
       void FlushBuffer(void);
    
    public :
       IndexCompressor(void) { m_pFile = 0; m_bDebugDump = false; }
       ~IndexCompressor(void) {}
    
       void Attach(File& File) { m_pFile = &File; }
    
       void Begin(void);
       void Add(WA_DWORD dwRecNo, WA_DWORD dwWordNo);
       void End(void);
    
       WA_DWORD GetRecordCount(void) { return m_dwRecordCount; }
       WA_DWORD GetHitCount(void) { return m_dwHitCount; }
    
       void DebugDump(void) { m_bDebugDump = true; }
    };
    

    IndexCompressor.cpp

    //
    // index compressor class
    //
    
    #include "stdafx.h"
    #include "IndexCompressor.h"
    
    // Hands the bytes accumulated in the staging buffer to the attached file
    // and marks the buffer as empty. Does nothing when the buffer holds no data.
    void IndexCompressor::FlushBuffer(void)
    {
       ASSERT(m_pFile != 0);

       const bool bHasPending = (m_dwBytes > 0);
       if (!bHasPending)
          return;

       m_pFile->Write(m_byBuffer, m_dwBytes);
       m_dwBytes = 0;
    }
    
    // Prepares the compressor for a new sequence of Add() calls by zeroing
    // the staging-buffer fill level, the statistics and the delta base.
    // A file must already be attached.
    void IndexCompressor::Begin(void)
    {
       ASSERT(m_pFile != 0);

       // staging buffer starts out empty
       m_dwBytes = 0;

       // statistics reported by GetHitCount() / GetRecordCount()
       m_dwHitCount = 0;
       m_dwRecordCount = 0;

       // previous hit against which the first Add() computes its deltas
       m_dwWordNo = 0;
       m_dwRecNo = 0;
    }
    
    // Encodes one (record number, word number) hit as a delta against the
    // previously added hit and appends the result to the staging buffer,
    // flushing the buffer to the file first if the record would not fit.
    //
    // dwRecNo  - record number of the hit; asserted to be >= the previous one.
    // dwWordNo - word number of the hit. The word delta base is reset to 0
    //            whenever the record number changes.
    //
    // Record layouts (first byte shown in bits):
    //   1WWWWWWW                    same record, word delta < 128
    //   01WWWWWW WWWWWWWW           same record, word delta < 16384,
    //                               high 6 bits first, then the low byte
    //   001RRRRR <word>             record delta < 32, word delta < 65536,
    //                               word stored as a WA_WORD
    //   0001rrww <rec> <word>       general form; rr (0x0C) and ww (0x03)
    //                               select the field width: 00 = WA_BYTE,
    //                               01 = WA_WORD, 10 = WA_DWORD
    // WA_WORD / WA_DWORD fields are stored in the host's byte order
    // (presumably 2 and 4 bytes wide -- confirm against the typedefs).
    //
    // Multi-byte fields are copied into the scratch buffer bytewise with
    // CopyMemory. Their offsets in the byte array are not aligned for
    // WA_WORD / WA_DWORD, so storing through a cast pointer would be a
    // misaligned access (undefined behavior, and a fault on CPUs that
    // enforce alignment). The bytes produced are the same.
    void IndexCompressor::Add(WA_DWORD dwRecNo, WA_DWORD dwWordNo)
    {
       ASSERT(m_pFile != 0);
       WA_BYTE buffer[16];
       int nbytes = 1;      // every record has at least the leading tag byte

       ASSERT(dwRecNo >= m_dwRecNo);

       // a new record restarts word numbering for delta purposes
       if (dwRecNo != m_dwRecNo)
          m_dwWordNo = 0;
       // the very first hit counts as a record even if its number is 0
       if (m_dwRecordCount == 0 || dwRecNo != m_dwRecNo)
          ++m_dwRecordCount;
       ++m_dwHitCount;

       WA_DWORD dwRecNoDelta = dwRecNo - m_dwRecNo;
       WA_DWORD dwWordNoDelta = dwWordNo - m_dwWordNo;

       if (m_bDebugDump)
       {
          TRACE("%8X[%8X] %8X[%8X] : ", dwRecNo, dwRecNoDelta, dwWordNo, dwWordNoDelta);
       }

       // 1WWWWWWW
       if (dwRecNoDelta == 0 && dwWordNoDelta < 128)
       {
          buffer[0] = 0x80 | WA_BYTE(dwWordNoDelta);
       }
       // 01WWWWWW WWWWWWWW
       else if (dwRecNoDelta == 0 && dwWordNoDelta < 16384)
       {
          buffer[0] = 0x40 | WA_BYTE(dwWordNoDelta >> 8);
          buffer[1] = WA_BYTE(dwWordNoDelta & 0x00ff);
          nbytes += sizeof(WA_BYTE);
       }
       // 001RRRRR WWWWWWWW WWWWWWWW
       else if (dwRecNoDelta < 32 && dwWordNoDelta < 65536)
       {
          buffer[0] = 0x20 | WA_BYTE(dwRecNoDelta);
          WA_WORD wWordNoDelta = WA_WORD(dwWordNoDelta);
          CopyMemory(buffer + nbytes, &wWordNoDelta, sizeof(WA_WORD));
          nbytes += sizeof(WA_WORD);
       }
       else
       {
          // 0001rrww
          buffer[0] = 0x10;

          // encode recno
          if (dwRecNoDelta < 256)
          {
             buffer[nbytes] = WA_BYTE(dwRecNoDelta);
             nbytes += sizeof(WA_BYTE);
          }
          else if (dwRecNoDelta < 65536)
          {
             buffer[0] |= 0x04;
             WA_WORD wRecNoDelta = WA_WORD(dwRecNoDelta);
             CopyMemory(buffer + nbytes, &wRecNoDelta, sizeof(WA_WORD));
             nbytes += sizeof(WA_WORD);
          }
          else
          {
             buffer[0] |= 0x08;
             CopyMemory(buffer + nbytes, &dwRecNoDelta, sizeof(WA_DWORD));
             nbytes += sizeof(WA_DWORD);
          }

          // encode wordno
          if (dwWordNoDelta < 256)
          {
             buffer[nbytes] = WA_BYTE(dwWordNoDelta);
             nbytes += sizeof(WA_BYTE);
          }
          else if (dwWordNoDelta < 65536)
          {
             buffer[0] |= 0x01;
             WA_WORD wWordNoDelta = WA_WORD(dwWordNoDelta);
             CopyMemory(buffer + nbytes, &wWordNoDelta, sizeof(WA_WORD));
             nbytes += sizeof(WA_WORD);
          }
          else
          {
             buffer[0] |= 0x02;
             CopyMemory(buffer + nbytes, &dwWordNoDelta, sizeof(WA_DWORD));
             nbytes += sizeof(WA_DWORD);
          }
       }

       // update current setting
       m_dwRecNo = dwRecNo;
       m_dwWordNo = dwWordNo;

       // add compressed data to buffer
       // (a zero tag byte is never produced; End() writes a lone zero byte)
       ASSERT(buffer[0] != 0);
       ASSERT(nbytes > 0 && nbytes < 10);
       if (m_dwBytes + nbytes > IC_BUFFER_SIZE)
          FlushBuffer();
       CopyMemory(m_byBuffer + m_dwBytes, buffer, nbytes);
       m_dwBytes += nbytes;

       if (m_bDebugDump)
       {
          for (int i = 0; i < nbytes; ++i)
             TRACE("%02X ", buffer[i]);
          TRACE("\n");
       }
    }
    
    // Finishes the output: writes whatever is still held in the staging
    // buffer, then writes a single zero byte to the file. Add() asserts that
    // no encoded record starts with a zero byte, so this byte is presumably
    // the end-of-data marker for the reader -- confirm against the decoder.
    // m_pFile is dereferenced here; FlushBuffer() asserts it is non-null first.
    void IndexCompressor::End(void)
    {
       FlushBuffer();
       m_pFile->Write(WA_BYTE(0));
    }
    

提交回复
热议问题