hyperscan高性能的正则匹配

hyperscan高性能的正则匹配

hyperscan是一个高性能的正则表达式匹配库,由 Intel 开发并开源,旨在提供高速、低延迟的模式匹配能力。

代码地址:https://github.com/intel/hyperscan

使用手册在:http://intel.github.io/hyperscan/dev-reference/index.html

官方说明文档:https://www.intel.com/content/www/us/en/developer/articles/technical/introduction-to-hyperscan.html

hyperscan编译:

hyperscan 的编译依赖以下几个库和工具,编译前需要确保它们都已安装:CMake、Ragel、Python、Boost(只需要头文件);Pcap 为可选依赖,仅在编译性能测试工具时需要。

第一步:下载代码(这里使用 v5.4.2 版本)

git clone --branch v5.4.2 https://github.com/intel/hyperscan.git

第二步:构造项目(先进入上一步下载得到的 hyperscan 源码目录,再执行下面的命令)

mkdir build && cd build && cmake -DBUILD_SHARED_LIBS=ON ..

第三步:编译安装

make -j 6 && make install

测试代码ths.c:

// gcc -Wall -o test ths.c -lhs

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <hs/hs.h>

// Capacity of the pattern table: the size of each array in exptns_t.
#define PATTERN_MAX 1024

// Pattern table stored as three parallel arrays: entry i is described by
// exppttns[i], expflgs[i] and expids[i]. Only the first `num` entries are
// in use.
typedef struct _exptns {
    char *exppttns[PATTERN_MAX];        // pattern strings (add_pattern stores heap copies here)
    unsigned int expflgs[PATTERN_MAX];  // flags for each pattern
    unsigned int expids[PATTERN_MAX];   // id for each pattern
    int num;                            // number of entries currently stored
} exptns_t;

// Append one pattern to the table.
//
//   exptns  - table to append to
//   pattern - NUL-terminated expression; it is copied, so the caller keeps
//             ownership of the original string
//   expflg  - flags stored alongside the pattern
//   expid   - id stored alongside the pattern
//
// Returns 0 on success. Returns -1 and leaves the table unchanged when an
// argument is NULL, the pattern is empty, the table is full (or its count is
// negative), or the copy cannot be allocated.
int add_pattern(exptns_t *exptns, char * pattern, unsigned int expflg, unsigned int expid) {
    if (exptns == NULL || pattern == NULL) {
        return -1;
    }

    int id = exptns->num;
    if (id < 0 || id >= PATTERN_MAX) {
        return -1;
    }

    size_t len = strlen(pattern);
    if (len == 0) {
        return -1;
    }

    // 1. Store a private copy of the expression (terminator included).
    char *copy = malloc(len + 1);
    if (copy == NULL) {
        return -1;
    }
    memcpy(copy, pattern, len + 1);
    exptns->exppttns[id] = copy;

    // 2. Flags for this expression.
    exptns->expflgs[id] = expflg;

    // 3. Id associated with this expression.
    exptns->expids[id] = expid;

    // Publish the entry only once it is fully populated.
    exptns->num = id + 1;
    return 0;
}


// Match callback handed to hs_scan.
//
// Prints the pattern id, the start/end offsets and the matched text.
// `context` is treated as the start of the scanned buffer, so the matched
// text is read from context + from. The callers in this file compile their
// patterns with HS_FLAG_SOM_LEFTMOST, which is what makes `from` a real
// start offset. `flags` is not used.
//
// Always returns 0 (per the Hyperscan API, a non-zero return asks the
// scanner to stop).
int match_cb(unsigned int id, unsigned long long from, unsigned long long to, unsigned int flags, void *context) {
    const char *matched = (char *)context + from ;
    int matched_len = (int)to - (int)from ;

    printf("match id %u from %llu to %llu %.*s\n", id, from, to, matched_len, matched) ;
    return 0 ;
}

// Release the pattern table, the scratch space and the compiled database.
//
// Each argument is optional: pass NULL to skip that resource. After a
// resource is released its pointer (or table slot) is reset, so calling this
// again with the same arguments does nothing further.
void free_hs_data(exptns_t *exptns, hs_database_t ** hsdb, hs_scratch_t ** hsscth){
    // 1. Pattern strings: free every stored copy and clear its slot.
    if (exptns != NULL) {
        int count = exptns->num ;
        for (int idx = 0; idx < count; idx++) {
            char *text = exptns->exppttns[idx] ;
            if (text == NULL) {
                continue ;
            }
            free(text) ;
            exptns->exppttns[idx] = NULL ;
            exptns->expflgs[idx] = 0 ;
            exptns->expids[idx] = 0 ;
        }
        exptns->num = 0 ;
    }

    // 2. Scratch space.
    if (hsscth != NULL && *hsscth != NULL) {
        hs_free_scratch(*hsscth) ;
        *hsscth = NULL ;
    }

    // 3. Compiled pattern database.
    if (hsdb != NULL && *hsdb != NULL) {
        hs_free_database(*hsdb) ;
        *hsdb = NULL ;
    }
}

int main(int argc, char *argv[]) {
    exptns_t exptns ;
    memset(&exptns, 0x00, sizeof(exptns)) ;

    // 0、添加 3 个表达式
    add_pattern(&exptns, "abcd",  HS_FLAG_DOTALL | HS_FLAG_CASELESS | HS_FLAG_SOM_LEFTMOST | HS_FLAG_MULTILINE, 101) ;
    add_pattern(&exptns, "opq(rst)u",  HS_FLAG_DOTALL | HS_FLAG_CASELESS | HS_FLAG_SOM_LEFTMOST | HS_FLAG_MULTILINE, 102) ;
    add_pattern(&exptns, "hij",  HS_FLAG_DOTALL | HS_FLAG_CASELESS | HS_FLAG_SOM_LEFTMOST | HS_FLAG_MULTILINE, 103) ;


    hs_database_t * hsdb = NULL ;       // 用来接收编译后的 hs 库地址
    hs_compile_error_t * cmplerr = NULL ;
    hs_error_t hs_ret ;
    // 1、编译
    hs_ret = hs_compile_multi((const char * const *)exptns.exppttns, (const unsigned int*)exptns.expflgs, 
                                (const unsigned int *)exptns.expids, exptns.num, 
                                HS_MODE_BLOCK, NULL, &hsdb, &cmplerr) ;

    if( hs_ret != HS_SUCCESS ) {
        if(cmplerr->expression < 0 ) {
            printf("compile multi failed : %s\n", cmplerr->message) ;
        } else {
            printf("compile multi failed : %s, error id %d rule %s\n", cmplerr->message, 
                            exptns.expids[cmplerr->expression], exptns.exppttns[cmplerr->expression]) ;
        }
        hs_free_compile_error(cmplerr) ;
        free_hs_data(&exptns, NULL, NULL) ;
        return -1 ;
    }

    hs_scratch_t * hsscth = NULL ;
    // 2、获取匹配句柄
    hs_ret = hs_alloc_scratch(hsdb, &hsscth) ;
    if( hs_ret != HS_SUCCESS ) {
        printf("get hs scratch failed !\n") ;
        free_hs_data(&exptns, &hsdb, NULL) ;
        return -1 ;
    }

    // 3、匹配
    char * str = "xxabcdefghijkLMNOPQrstUVWzyxabCDEFghiijklm" ;
    hs_ret = hs_scan(hsdb, str, strlen(str), 0, hsscth, &match_cb, str) ;
    if( hs_ret != HS_SUCCESS ) {
        printf("get hs scan failed !\n") ;
        return -1 ;
    }


    printf("\n===============================\n") ;
    // 测试2
    hs_scratch_t * hsscth2 = NULL ;
    hs_ret = hs_alloc_scratch(hsdb, &hsscth2) ;
    if( hs_ret != HS_SUCCESS ) {
        printf("get hs scratch failed !\n") ;
        free_hs_data(&exptns, &hsdb, &hsscth) ;
        return -1 ;
    }

    hs_ret = hs_scan(hsdb, str, strlen(str), 0, hsscth2, &match_cb, str) ;
    if( hs_ret != HS_SUCCESS ) {
        printf("get hs scan failed !\n") ;
        return -1 ;
    }

    printf("\n--------------------------------------------\n") ;
    // 测试3
    hs_scratch_t * hsscth3 = NULL ;
    hs_ret = hs_clone_scratch(hsscth2, &hsscth3) ;
    if( hs_ret != HS_SUCCESS ) {
        printf("clone hs scratch failed !\n") ;
        free_hs_data(&exptns, &hsdb, &hsscth) ;
        return -1 ;
    }
    free_hs_data(NULL, NULL, &hsscth2) ;

    hs_ret = hs_scan(hsdb, str, strlen(str), 0, hsscth3, &match_cb, str) ;
    if( hs_ret != HS_SUCCESS ) {
        printf("get hs scan failed !\n") ;
        return -1 ;
    }
    free_hs_data(NULL, NULL, &hsscth3) ;


    // 4、空间释放
    free_hs_data(&exptns, &hsdb, &hsscth) ;
    
    return 0 ;
}

编译和测试:

评论

发表回复

您的邮箱地址不会被公开。 必填项已用 * 标注