I have the very common problem of creating an index for an in-disk array of strings. In short, I need to store the position of each string in the in-disk representation. For exa
I did something similar years ago for a full-text search engine. In my case, each indexed word generated a record which consisted of a record number (document id) and a word number (it could just as easily have stored word offsets) which needed to be compressed as much as possible. I used a delta-compression technique which took advantage of the fact that there would be a number of occurrences of the same word within a document, so the record number often did not need to be repeated at all. And the word offset delta would often fit within one or two bytes. Here is the code I used.
Since it's in C++, the code may is not going to be useful to you as is, but can be a good starting point for writing compressions routines.
Please excuse the hungarian notation and the magic numbers strewn within the code. Like I said, I wrote this many years ago :-)
//
// index compressor class
//
#pragma once
#include "File.h"
const int IC_BUFFER_SIZE = 8192;
//
// index compressor
//
class IndexCompressor
{
private :
File *m_pFile;
WA_DWORD m_dwRecNo;
WA_DWORD m_dwWordNo;
WA_DWORD m_dwRecordCount;
WA_DWORD m_dwHitCount;
WA_BYTE m_byBuffer[IC_BUFFER_SIZE];
WA_DWORD m_dwBytes;
bool m_bDebugDump;
void FlushBuffer(void);
public :
IndexCompressor(void) { m_pFile = 0; m_bDebugDump = false; }
~IndexCompressor(void) {}
void Attach(File& File) { m_pFile = &File; }
void Begin(void);
void Add(WA_DWORD dwRecNo, WA_DWORD dwWordNo);
void End(void);
WA_DWORD GetRecordCount(void) { return m_dwRecordCount; }
WA_DWORD GetHitCount(void) { return m_dwHitCount; }
void DebugDump(void) { m_bDebugDump = true; }
};
//
// index compressor class
//
#include "stdafx.h"
#include "IndexCompressor.h"
void IndexCompressor::FlushBuffer(void)
{
ASSERT(m_pFile != 0);
if (m_dwBytes > 0)
{
m_pFile->Write(m_byBuffer, m_dwBytes);
m_dwBytes = 0;
}
}
void IndexCompressor::Begin(void)
{
ASSERT(m_pFile != 0);
m_dwRecNo = m_dwWordNo = m_dwRecordCount = m_dwHitCount = 0;
m_dwBytes = 0;
}
void IndexCompressor::Add(WA_DWORD dwRecNo, WA_DWORD dwWordNo)
{
ASSERT(m_pFile != 0);
WA_BYTE buffer[16];
int nbytes = 1;
ASSERT(dwRecNo >= m_dwRecNo);
if (dwRecNo != m_dwRecNo)
m_dwWordNo = 0;
if (m_dwRecordCount == 0 || dwRecNo != m_dwRecNo)
++m_dwRecordCount;
++m_dwHitCount;
WA_DWORD dwRecNoDelta = dwRecNo - m_dwRecNo;
WA_DWORD dwWordNoDelta = dwWordNo - m_dwWordNo;
if (m_bDebugDump)
{
TRACE("%8X[%8X] %8X[%8X] : ", dwRecNo, dwRecNoDelta, dwWordNo, dwWordNoDelta);
}
// 1WWWWWWW
if (dwRecNoDelta == 0 && dwWordNoDelta < 128)
{
buffer[0] = 0x80 | WA_BYTE(dwWordNoDelta);
}
// 01WWWWWW WWWWWWWW
else if (dwRecNoDelta == 0 && dwWordNoDelta < 16384)
{
buffer[0] = 0x40 | WA_BYTE(dwWordNoDelta >> 8);
buffer[1] = WA_BYTE(dwWordNoDelta & 0x00ff);
nbytes += sizeof(WA_BYTE);
}
// 001RRRRR WWWWWWWW WWWWWWWW
else if (dwRecNoDelta < 32 && dwWordNoDelta < 65536)
{
buffer[0] = 0x20 | WA_BYTE(dwRecNoDelta);
WA_WORD *p = (WA_WORD *) (buffer+1);
*p = WA_WORD(dwWordNoDelta);
nbytes += sizeof(WA_WORD);
}
else
{
// 0001rrww
buffer[0] = 0x10;
// encode recno
if (dwRecNoDelta < 256)
{
buffer[nbytes] = WA_BYTE(dwRecNoDelta);
nbytes += sizeof(WA_BYTE);
}
else if (dwRecNoDelta < 65536)
{
buffer[0] |= 0x04;
WA_WORD *p = (WA_WORD *) (buffer+nbytes);
*p = WA_WORD(dwRecNoDelta);
nbytes += sizeof(WA_WORD);
}
else
{
buffer[0] |= 0x08;
WA_DWORD *p = (WA_DWORD *) (buffer+nbytes);
*p = dwRecNoDelta;
nbytes += sizeof(WA_DWORD);
}
// encode wordno
if (dwWordNoDelta < 256)
{
buffer[nbytes] = WA_BYTE(dwWordNoDelta);
nbytes += sizeof(WA_BYTE);
}
else if (dwWordNoDelta < 65536)
{
buffer[0] |= 0x01;
WA_WORD *p = (WA_WORD *) (buffer+nbytes);
*p = WA_WORD(dwWordNoDelta);
nbytes += sizeof(WA_WORD);
}
else
{
buffer[0] |= 0x02;
WA_DWORD *p = (WA_DWORD *) (buffer+nbytes);
*p = dwWordNoDelta;
nbytes += sizeof(WA_DWORD);
}
}
// update current setting
m_dwRecNo = dwRecNo;
m_dwWordNo = dwWordNo;
// add compressed data to buffer
ASSERT(buffer[0] != 0);
ASSERT(nbytes > 0 && nbytes < 10);
if (m_dwBytes + nbytes > IC_BUFFER_SIZE)
FlushBuffer();
CopyMemory(m_byBuffer + m_dwBytes, buffer, nbytes);
m_dwBytes += nbytes;
if (m_bDebugDump)
{
for (int i = 0; i < nbytes; ++i)
TRACE("%02X ", buffer[i]);
TRACE("\n");
}
}
void IndexCompressor::End(void)
{
FlushBuffer();
m_pFile->Write(WA_BYTE(0));
}