声明
snort中的字串查找,以下代码出自snort-2.9.6.0。是snort中用于字串匹配的底层接口。主要是参考了Boyer-Moore算法进行实现的.
关于Boyer-Moore的文章可以参考: http://my.oschina.net/u/572632/blog/283380
本文主要是对该部分代码做一分析总结。
代码
/*
* bmh.h
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License Version 2 as
* published by the Free Software Foundation. You may not use, modify or
* distribute this program under any other version of the GNU General
* Public License.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*
* Copyright (C) 2014 Cisco and/or its affiliates. All rights reserved.
* Copyright (C) 2005-2013 Sourcefire, Inc.
*
* Author: Marc Norton
*
* Date: 5/2005
*
* Boyer-Moore-Horsepool for small pattern groups
*
*/
#ifndef BOYER_MOORE_HORSPOOL
#define BOYER_MOORE_HORSPOOL
#define HBM_STATIC
typedef struct {
unsigned char *P; /*原始模式串*/
unsigned char *Pnc; /*全部转换为大写的模式串*/
int M; /*模式串长度*/
int bcShift[256]; /*存放坏字符到模式串尾部的距离*/
int nocase; /*大小不敏感标记*/
}HBM_STRUCT;
HBM_STATIC HBM_STRUCT * hbm_prep( unsigned char * pat, int m, int nocase );
HBM_STATIC int hbm_prepx( HBM_STRUCT *p, unsigned char * pat, int m, int nocase );
HBM_STATIC const unsigned char * hbm_match( HBM_STRUCT *p, const unsigned char * text, int n );
HBM_STATIC void hbm_free( HBM_STRUCT *p );
#endif
/*
* bmh.c
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License Version 2 as
* published by the Free Software Foundation. You may not use, modify or
* distribute this program under any other version of the GNU General
* Public License.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*
* Copyright (C) 2014 Cisco and/or its affiliates. All rights reserved.
* Copyright (C) 2005-2013 Sourcefire, Inc.
*
* Author: Marc Norton
*
* Date: 5/2005
*
* Boyer-Moore-Horsepool for small pattern groups
*
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include "sf_types.h"
#include "bmh.h"
#include "sf_dynamic_engine.h"
HBM_STATIC
int hbm_prepx (HBM_STRUCT *p, unsigned char * pat, int m, int nocase )
{
int i,k;
unsigned char *t;
if( !m ) return 0;
if( !p ) return 0;
p->P = pat;
p->M = m;
p->nocase = nocase;
if( nocase ) /* nocase 是为了处理大小写不敏感的匹配方式 */
{
t = (unsigned char*)malloc(m);
if ( !t ) return 0;
memcpy(t,pat,m);
for(i=0;i<m;i++)
{
t[i] = (unsigned char)toupper(t[i]);
}
p->Pnc = t;
}
else
{
p->Pnc = 0;
}
/* Compute normal Boyer-Moore Bad Character Shift */
/** 构建坏字符跳转表, 数组中的索引为字符的值, 其中数据为该字符在模式串中最后一次出现与模式串尾部的距离*/
for(k = 0; k < 256; k++) p->bcShift[k] = m;
if( nocase )
{
for(k = 0; k < m; k++)
p->bcShift[ p->Pnc[k] ] = m - k - 1;
}
else
{
for(k = 0; k < m; k++)
p->bcShift[ p->P[k] ] = m - k - 1;
}
return 1;
}
/**
* 分配空间, 不用关注太多
*/
HBM_STATIC
HBM_STRUCT * hbm_prep(unsigned char * pat, int m, int nocase)
{
HBM_STRUCT *p;
p = (HBM_STRUCT*)malloc(sizeof(HBM_STRUCT));
if (!p)
{
DynamicEngineFatalMessage("Failed to allocate memory for pattern matching.");
}
if( !hbm_prepx( p, pat, m, nocase) )
{
DynamicEngineFatalMessage("Error initializing pattern matching. Check arguments.");
}
return p;
}
/*
* 释放空间,不用关注太多
*/
HBM_STATIC
void hbm_free( HBM_STRUCT *p )
{
if(p)
{
if( p->Pnc )free(p->Pnc);
free(p);
}
}
/*
* Boyer-Moore Horspool
* Does NOT use Sentinel Byte(s)
* Scan and Match Loops are unrolled and separated
* Optimized for 1 byte patterns as well
*
*/
HBM_STATIC
const unsigned char * hbm_match(HBM_STRUCT * px, const unsigned char * text, int n)
{
const unsigned char *pat, *t, *et, *q;
int m1, k;
int *bcShift;
if( px->nocase )
{
pat = px->Pnc;
}
else
{
pat = px->P;
}
m1 = px->M-1;
bcShift= px->bcShift;
//printf("bmh_match: pattern=%.*s, %d bytes \n",px->M,pat,px->M);
t = text + m1;
et = text + n;
/**
* 1. t 指向将模式串与text串头部对齐后, 与模式串尾部对齐的text中的字符/
* 2. et 指向text串的尾部
*/
/* Handle 1 Byte patterns - it's a faster loop */
if( !m1 ) /*模式串长度为1时直接匹配*/
{
if( !px->nocase )
{
for( ;t<et; t++ )
if( *t == *pat ) return t;
}
else
{
for( ;t<et; t++ )
if( toupper(*t) == *pat ) return t;
}
return 0;
}
if( !px->nocase )
{
/* Handle MultiByte Patterns */
while( t < et )
{
/* Scan Loop - Bad Character Shift */
/**
* 这里代码的目地是不断匹配,找到能够与模式串有一个末尾字符匹配的字串
* 1.与模式串尾部有一位字符出现匹配时停止
* 2.整个输入字符扫描完后任然无法匹配就匹配失败
*/
do
{
/*同时完成多个操作,充分利用cpu流水线的并行处理,减少操作数*/
t += bcShift[*t];
if( t >= et )return 0;
t += (k=bcShift[*t]);
if( t >= et )return 0;
} while( k );
/* Unrolled Match Loop */
k = m1;
q = t - m1;
while( k >= 4 )
{
/*同时完成多个操作,充分利用cpu流水线的并行处理,减少操作数*/
if( pat[k] != q[k] )goto NoMatch; k--;
if( pat[k] != q[k] )goto NoMatch; k--;
if( pat[k] != q[k] )goto NoMatch; k--;
if( pat[k] != q[k] )goto NoMatch; k--;
}
/* Finish Match Loop */
while( k >= 0 )
{
if( pat[k] != q[k] )goto NoMatch; k--;
}
/* If matched - return 1st char of pattern in text */
return q;
NoMatch:
t++;
}
}
else /* NoCase - convert input string to upper case as we process it */
{
/* Handle MultiByte Patterns */
while( t < et )
{
/* Scan Loop - Bad Character Shift */
do
{
t += bcShift[toupper(*t)];
if( t >= et )return 0;;
t += (k=bcShift[toupper(*t)]);
if( t >= et )return 0;
} while( k );
/* Unrolled Match Loop */
k = m1;
q = t - m1;
while( k >= 4 )
{
if( pat[k] != toupper(q[k]) )goto NoMatchNC; k--;
if( pat[k] != toupper(q[k]) )goto NoMatchNC; k--;
if( pat[k] != toupper(q[k]) )goto NoMatchNC; k--;
if( pat[k] != toupper(q[k]) )goto NoMatchNC; k--;
}
/* Finish Match Loop */
while( k >= 0 )
{
if( pat[k] != toupper(q[k]) )goto NoMatchNC; k--;
}
/* If matched - return 1st char of pattern in text */
return q;
NoMatchNC:
t++;
}
}
return 0;
}
#endif
总结
该段代码主要是利用了BM算法的坏字符处理,而未使用好后缀,一旦找到末尾匹配上一个字符的就使用朴素匹配的方式
在循环中包含多部处理,虽然相互有依赖,但能通过cpu并行处理减少开销
来源:oschina
链接:https://my.oschina.net/u/572632/blog/288664