Find byte sequence within a byte array

隐身守侯 提交于 2020-01-12 07:50:34

问题


I have a byte array and wish to find the "occurrences" of some bytes.

For example 00 69 73 6F 6D in a very large byte array (> 50/100 Megabytes)

OR

even better a reverse operation: Searching the most common pattern without knowing it the code should be able to read and find it from the file.


回答1:


You can use the Boyer-Moore algorithm to efficiently search for a sequence of bytes in an array of bytes.

Here's a C# version I converted from the Java version from the Wikipedia entry on Boyer-Moore.

public sealed class BoyerMoore
{
    readonly byte[] needle;
    readonly int[] charTable;
    readonly int[] offsetTable;

    public BoyerMoore(byte[] needle)
    {
        this.needle = needle;
        this.charTable = makeByteTable(needle);
        this.offsetTable = makeOffsetTable(needle);
    }

    public IEnumerable<int> Search(byte[] haystack)
    {
        if (needle.Length == 0)
            yield break;

        for (int i = needle.Length - 1; i < haystack.Length;)
        {
            int j;

            for (j = needle.Length - 1; needle[j] == haystack[i]; --i, --j)
            {
                if (j != 0)
                    continue;

                yield return i;
                i += needle.Length - 1;
                break;
            }

            i += Math.Max(offsetTable[needle.Length - 1 - j], charTable[haystack[i]]);
        }
    }

    static int[] makeByteTable(byte[] needle)
    {
        const int ALPHABET_SIZE = 256;
        int[] table = new int[ALPHABET_SIZE];

        for (int i = 0; i < table.Length; ++i)
            table[i] = needle.Length;

        for (int i = 0; i < needle.Length - 1; ++i)
            table[needle[i]] = needle.Length - 1 - i;

        return table;
    }

    static int[] makeOffsetTable(byte[] needle)
    {
        int[] table = new int[needle.Length];
        int lastPrefixPosition = needle.Length;

        for (int i = needle.Length - 1; i >= 0; --i)
        {
            if (isPrefix(needle, i + 1))
                lastPrefixPosition = i + 1;

            table[needle.Length - 1 - i] = lastPrefixPosition - i + needle.Length - 1;
        }

        for (int i = 0; i < needle.Length - 1; ++i)
        {
            int slen = suffixLength(needle, i);
            table[slen] = needle.Length - 1 - i + slen;
        }

        return table;
    }

    static bool isPrefix(byte[] needle, int p)
    {
        for (int i = p, j = 0; i < needle.Length; ++i, ++j)
            if (needle[i] != needle[j])
                return false;

        return true;
    }

    static int suffixLength(byte[] needle, int p)
    {
        int len = 0;

        for (int i = p, j = needle.Length - 1; i >= 0 && needle[i] == needle[j]; --i, --j)
            ++len;

        return len;
    }
}

Here's some console app test code for it:

public static void Main()
{
    byte[] haystack = new byte[10000];
    byte[] needle = { 0x00, 0x69, 0x73, 0x6F, 0x6D };

    // Put a few copies of the needle into the haystack.

    for (int i = 1000; i <= 9000; i += 1000) 
        Array.Copy(needle, 0, haystack, i, needle.Length);

    var searcher = new BoyerMoore(needle);

    foreach (int index in searcher.Search(haystack))
        Console.WriteLine(index);
}

Note how the Search() method returns the indices of all the locations of the start of needle inside haystack.

If you just wanted the count, you could just do:

int count = new BoyerMoore(needle).Search(haystack).Count();

For your second question: I assume you are asking about finding the longest repeated sequence of bytes?

That's a much more complicated - and very different - question. If you want an answer for that, you should ask a separate question for it, but you should read the Wikipedia entry on the "longest repeated substring problem".



来源:https://stackoverflow.com/questions/37500629/find-byte-sequence-within-a-byte-array

标签
易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!