| Thomas Graf | 6408f79 | 2005-06-23 20:59:16 -0700 | [diff] [blame] | 1 | /* | 
|  | 2 | * lib/ts_fsm.c	   A naive finite state machine text search approach | 
|  | 3 | * | 
|  | 4 | *		This program is free software; you can redistribute it and/or | 
|  | 5 | *		modify it under the terms of the GNU General Public License | 
|  | 6 | *		as published by the Free Software Foundation; either version | 
|  | 7 | *		2 of the License, or (at your option) any later version. | 
|  | 8 | * | 
|  | 9 | * Authors:	Thomas Graf <tgraf@suug.ch> | 
|  | 10 | * | 
|  | 11 | * ========================================================================== | 
|  | 12 | * | 
|  | 13 | *   A finite state machine consists of n states (struct ts_fsm_token) | 
|  | 14 | *   representing the pattern as a finite automaton. The data is read | 
|  | 15 | *   sequentially on an octet basis. Every state token specifies the number | 
|  | 16 | *   of recurrences and the type of value accepted which can be either a | 
|  | 17 | *   specific character or ctype based set of characters. The available | 
|  | 18 | *   type of recurrences include 1, (0|1), [0 n], and [1 n]. | 
|  | 19 | * | 
|  | 20 | *   The algorithm differs between strict/non-strict mode specifying | 
|  | 21 | *   whether the pattern has to start at the first octet. Strict mode | 
|  | 22 | *   is enabled by default and can be disabled by inserting | 
|  | 23 | *   TS_FSM_HEAD_IGNORE as the first token in the chain. | 
|  | 24 | * | 
|  | 25 | *   The runtime performance of the algorithm should be around O(n), | 
|  | 26 | *   however while in strict mode the average runtime can be better. | 
|  | 27 | */ | 
|  | 28 |  | 
|  | 29 | #include <linux/config.h> | 
|  | 30 | #include <linux/module.h> | 
|  | 31 | #include <linux/types.h> | 
|  | 32 | #include <linux/string.h> | 
|  | 33 | #include <linux/ctype.h> | 
|  | 34 | #include <linux/textsearch.h> | 
|  | 35 | #include <linux/textsearch_fsm.h> | 
|  | 36 |  | 
|  | 37 | struct ts_fsm | 
|  | 38 | { | 
|  | 39 | unsigned int		ntokens; | 
|  | 40 | struct ts_fsm_token	tokens[0]; | 
|  | 41 | }; | 
|  | 42 |  | 
|  | 43 | /* other values derived from ctype.h */ | 
|  | 44 | #define _A		0x100 /* ascii */ | 
|  | 45 | #define _W		0x200 /* wildcard */ | 
|  | 46 |  | 
|  | 47 | /* Map to _ctype flags and some magic numbers */ | 
|  | 48 | static u16 token_map[TS_FSM_TYPE_MAX+1] = { | 
|  | 49 | [TS_FSM_SPECIFIC] = 0, | 
|  | 50 | [TS_FSM_WILDCARD] = _W, | 
|  | 51 | [TS_FSM_CNTRL]	  = _C, | 
|  | 52 | [TS_FSM_LOWER]	  = _L, | 
|  | 53 | [TS_FSM_UPPER]	  = _U, | 
|  | 54 | [TS_FSM_PUNCT]	  = _P, | 
|  | 55 | [TS_FSM_SPACE]	  = _S, | 
|  | 56 | [TS_FSM_DIGIT]	  = _D, | 
|  | 57 | [TS_FSM_XDIGIT]	  = _D | _X, | 
|  | 58 | [TS_FSM_ALPHA]	  = _U | _L, | 
|  | 59 | [TS_FSM_ALNUM]	  = _U | _L | _D, | 
|  | 60 | [TS_FSM_PRINT]	  = _P | _U | _L | _D | _SP, | 
|  | 61 | [TS_FSM_GRAPH]	  = _P | _U | _L | _D, | 
|  | 62 | [TS_FSM_ASCII]	  = _A, | 
|  | 63 | }; | 
|  | 64 |  | 
|  | 65 | static u16 token_lookup_tbl[256] = { | 
|  | 66 | _W|_A|_C,      _W|_A|_C,     _W|_A|_C,     _W|_A|_C,		/*   0-  3 */ | 
|  | 67 | _W|_A|_C,      _W|_A|_C,     _W|_A|_C,     _W|_A|_C,		/*   4-  7 */ | 
|  | 68 | _W|_A|_C,      _W|_A|_C|_S,  _W|_A|_C|_S,  _W|_A|_C|_S,		/*   8- 11 */ | 
|  | 69 | _W|_A|_C|_S,   _W|_A|_C|_S,  _W|_A|_C,     _W|_A|_C,		/*  12- 15 */ | 
|  | 70 | _W|_A|_C,      _W|_A|_C,     _W|_A|_C,     _W|_A|_C,		/*  16- 19 */ | 
|  | 71 | _W|_A|_C,      _W|_A|_C,     _W|_A|_C,     _W|_A|_C,		/*  20- 23 */ | 
|  | 72 | _W|_A|_C,      _W|_A|_C,     _W|_A|_C,     _W|_A|_C,		/*  24- 27 */ | 
|  | 73 | _W|_A|_C,      _W|_A|_C,     _W|_A|_C,     _W|_A|_C,		/*  28- 31 */ | 
|  | 74 | _W|_A|_S|_SP,  _W|_A|_P,     _W|_A|_P,     _W|_A|_P,		/*  32- 35 */ | 
|  | 75 | _W|_A|_P,      _W|_A|_P,     _W|_A|_P,     _W|_A|_P,		/*  36- 39 */ | 
|  | 76 | _W|_A|_P,      _W|_A|_P,     _W|_A|_P,     _W|_A|_P,		/*  40- 43 */ | 
|  | 77 | _W|_A|_P,      _W|_A|_P,     _W|_A|_P,     _W|_A|_P,		/*  44- 47 */ | 
|  | 78 | _W|_A|_D,      _W|_A|_D,     _W|_A|_D,     _W|_A|_D,		/*  48- 51 */ | 
|  | 79 | _W|_A|_D,      _W|_A|_D,     _W|_A|_D,     _W|_A|_D,		/*  52- 55 */ | 
|  | 80 | _W|_A|_D,      _W|_A|_D,     _W|_A|_P,     _W|_A|_P,		/*  56- 59 */ | 
|  | 81 | _W|_A|_P,      _W|_A|_P,     _W|_A|_P,     _W|_A|_P,		/*  60- 63 */ | 
|  | 82 | _W|_A|_P,      _W|_A|_U|_X,  _W|_A|_U|_X,  _W|_A|_U|_X,		/*  64- 67 */ | 
|  | 83 | _W|_A|_U|_X,   _W|_A|_U|_X,  _W|_A|_U|_X,  _W|_A|_U,		/*  68- 71 */ | 
|  | 84 | _W|_A|_U,      _W|_A|_U,     _W|_A|_U,     _W|_A|_U,		/*  72- 75 */ | 
|  | 85 | _W|_A|_U,      _W|_A|_U,     _W|_A|_U,     _W|_A|_U,		/*  76- 79 */ | 
|  | 86 | _W|_A|_U,      _W|_A|_U,     _W|_A|_U,     _W|_A|_U,		/*  80- 83 */ | 
|  | 87 | _W|_A|_U,      _W|_A|_U,     _W|_A|_U,     _W|_A|_U,		/*  84- 87 */ | 
|  | 88 | _W|_A|_U,      _W|_A|_U,     _W|_A|_U,     _W|_A|_P,		/*  88- 91 */ | 
|  | 89 | _W|_A|_P,      _W|_A|_P,     _W|_A|_P,     _W|_A|_P,		/*  92- 95 */ | 
|  | 90 | _W|_A|_P,      _W|_A|_L|_X,  _W|_A|_L|_X,  _W|_A|_L|_X,		/*  96- 99 */ | 
|  | 91 | _W|_A|_L|_X,   _W|_A|_L|_X,  _W|_A|_L|_X,  _W|_A|_L,		/* 100-103 */ | 
|  | 92 | _W|_A|_L,      _W|_A|_L,     _W|_A|_L,     _W|_A|_L,		/* 104-107 */ | 
|  | 93 | _W|_A|_L,      _W|_A|_L,     _W|_A|_L,     _W|_A|_L,		/* 108-111 */ | 
|  | 94 | _W|_A|_L,      _W|_A|_L,     _W|_A|_L,     _W|_A|_L,		/* 112-115 */ | 
|  | 95 | _W|_A|_L,      _W|_A|_L,     _W|_A|_L,     _W|_A|_L,		/* 116-119 */ | 
|  | 96 | _W|_A|_L,      _W|_A|_L,     _W|_A|_L,     _W|_A|_P,		/* 120-123 */ | 
|  | 97 | _W|_A|_P,      _W|_A|_P,     _W|_A|_P,     _W|_A|_C,		/* 124-127 */ | 
|  | 98 | _W,            _W,           _W,           _W,			/* 128-131 */ | 
|  | 99 | _W,            _W,           _W,           _W,			/* 132-135 */ | 
|  | 100 | _W,            _W,           _W,           _W,			/* 136-139 */ | 
|  | 101 | _W,            _W,           _W,           _W,			/* 140-143 */ | 
|  | 102 | _W,            _W,           _W,           _W,			/* 144-147 */ | 
|  | 103 | _W,            _W,           _W,           _W,			/* 148-151 */ | 
|  | 104 | _W,            _W,           _W,           _W,			/* 152-155 */ | 
|  | 105 | _W,            _W,           _W,           _W,			/* 156-159 */ | 
|  | 106 | _W|_S|_SP,     _W|_P,        _W|_P,        _W|_P,		/* 160-163 */ | 
|  | 107 | _W|_P,         _W|_P,        _W|_P,        _W|_P,		/* 164-167 */ | 
|  | 108 | _W|_P,         _W|_P,        _W|_P,        _W|_P,		/* 168-171 */ | 
|  | 109 | _W|_P,         _W|_P,        _W|_P,        _W|_P,		/* 172-175 */ | 
|  | 110 | _W|_P,         _W|_P,        _W|_P,        _W|_P,		/* 176-179 */ | 
|  | 111 | _W|_P,         _W|_P,        _W|_P,        _W|_P,		/* 180-183 */ | 
|  | 112 | _W|_P,         _W|_P,        _W|_P,        _W|_P,		/* 184-187 */ | 
|  | 113 | _W|_P,         _W|_P,        _W|_P,        _W|_P,		/* 188-191 */ | 
|  | 114 | _W|_U,         _W|_U,        _W|_U,        _W|_U,		/* 192-195 */ | 
|  | 115 | _W|_U,         _W|_U,        _W|_U,        _W|_U,		/* 196-199 */ | 
|  | 116 | _W|_U,         _W|_U,        _W|_U,        _W|_U,		/* 200-203 */ | 
|  | 117 | _W|_U,         _W|_U,        _W|_U,        _W|_U,		/* 204-207 */ | 
|  | 118 | _W|_U,         _W|_U,        _W|_U,        _W|_U,		/* 208-211 */ | 
|  | 119 | _W|_U,         _W|_U,        _W|_U,        _W|_P,		/* 212-215 */ | 
|  | 120 | _W|_U,         _W|_U,        _W|_U,        _W|_U,		/* 216-219 */ | 
|  | 121 | _W|_U,         _W|_U,        _W|_U,        _W|_L,		/* 220-223 */ | 
|  | 122 | _W|_L,         _W|_L,        _W|_L,        _W|_L,		/* 224-227 */ | 
|  | 123 | _W|_L,         _W|_L,        _W|_L,        _W|_L,		/* 228-231 */ | 
|  | 124 | _W|_L,         _W|_L,        _W|_L,        _W|_L,		/* 232-235 */ | 
|  | 125 | _W|_L,         _W|_L,        _W|_L,        _W|_L,		/* 236-239 */ | 
|  | 126 | _W|_L,         _W|_L,        _W|_L,        _W|_L,		/* 240-243 */ | 
|  | 127 | _W|_L,         _W|_L,        _W|_L,        _W|_P,		/* 244-247 */ | 
|  | 128 | _W|_L,         _W|_L,        _W|_L,        _W|_L,		/* 248-251 */ | 
|  | 129 | _W|_L,         _W|_L,        _W|_L,        _W|_L};		/* 252-255 */ | 
|  | 130 |  | 
|  | 131 | static inline int match_token(struct ts_fsm_token *t, u8 d) | 
|  | 132 | { | 
|  | 133 | if (t->type) | 
|  | 134 | return (token_lookup_tbl[d] & t->type) != 0; | 
|  | 135 | else | 
|  | 136 | return t->value == d; | 
|  | 137 | } | 
|  | 138 |  | 
/*
 * fsm_find - run the token chain over the data of a search state
 * @conf:  search configuration; its private area holds the struct ts_fsm
 *         and its get_next_block() callback supplies the data
 * @state: search state; the search begins at state->offset
 *
 * The data is pulled in block by block through conf->get_next_block().
 * On a match the offset at which the match begins is returned and
 * state->offset is set to the offset at which matching stopped.
 * UINT_MAX is returned when there is no match.
 */
static unsigned int fsm_find(struct ts_config *conf, struct ts_state *state)
{
	struct ts_fsm *fsm = ts_config_priv(conf);
	struct ts_fsm_token *cur = NULL, *next;
	unsigned int match_start, block_idx = 0, tok_idx;
	unsigned block_len = 0, strict, consumed = state->offset;
	const u8 *data;

/*
 * Add the octets of the block just finished to `consumed', reset the
 * in-block index and fetch the following block. Evaluates to the length
 * of the new block.
 */
#define GET_NEXT_BLOCK()		\
({	consumed += block_idx;		\
	block_idx = 0;			\
	block_len = conf->get_next_block(consumed, &data, conf, state); })

/*
 * Strict mode gives up on the first mismatch. Otherwise the offending
 * octet is skipped and the token chain is rerun from its first token.
 */
#define TOKEN_MISMATCH()		\
	do {				\
		if (strict)		\
			goto no_match;	\
		block_idx++;		\
		goto startover;		\
	} while(0)

/*
 * True once the current block is used up and the next block comes back
 * with length 0. Evaluating it may therefore advance to the next block.
 */
#define end_of_data() unlikely(block_idx >= block_len && !GET_NEXT_BLOCK())

	/* Loads the first block; nothing to search if there is none. */
	if (end_of_data())
		goto no_match;

	/* Strict mode applies unless the chain starts with TS_FSM_HEAD_IGNORE. */
	strict = fsm->tokens[0].recur != TS_FSM_HEAD_IGNORE;

startover:
	match_start = consumed + block_idx;

	for (tok_idx = 0; tok_idx < fsm->ntokens; tok_idx++) {
		cur = &fsm->tokens[tok_idx];

		/* next is the token following cur, or NULL for the last token. */
		if (likely(tok_idx < (fsm->ntokens - 1)))
			next = &fsm->tokens[tok_idx + 1];
		else
			next = NULL;

		switch (cur->recur) {
		/* Exactly one octet has to match. */
		case TS_FSM_SINGLE:
			if (end_of_data())
				goto no_match;

			if (!match_token(cur, data[block_idx]))
				TOKEN_MISMATCH();
			break;

		/*
		 * Optional octet: without data or without a match, move on
		 * to the next token without consuming anything.
		 */
		case TS_FSM_PERHAPS:
			if (end_of_data() ||
			    !match_token(cur, data[block_idx]))
				continue;
			break;

		/* One mandatory match, then handled like TS_FSM_ANY. */
		case TS_FSM_MULTI:
			if (end_of_data())
				goto no_match;

			if (!match_token(cur, data[block_idx]))
				TOKEN_MISMATCH();

			block_idx++;
			/* fall through */

		case TS_FSM_ANY:
			/* A repetition token at the end of the chain matches at once. */
			if (next == NULL)
				goto found_match;

			if (end_of_data())
				continue;

			/*
			 * Consume octets accepted by cur until one is accepted
			 * by next; that octet is left for the next token.
			 */
			while (!match_token(next, data[block_idx])) {
				if (!match_token(cur, data[block_idx]))
					TOKEN_MISMATCH();
				block_idx++;
				if (end_of_data())
					goto no_match;
			}
			continue;

		/*
		 * Optimization: Prefer small local loop over jumping
		 * back and forth until garbage at head is munched.
		 */
		case TS_FSM_HEAD_IGNORE:
			if (end_of_data())
				continue;

			while (!match_token(next, data[block_idx])) {
				/*
				 * Special case, don't start over upon
				 * a mismatch, give the user the
				 * chance to specify the type of data
				 * allowed to be ignored.
				 */
				if (!match_token(cur, data[block_idx]))
					goto no_match;

				block_idx++;
				if (end_of_data())
					goto no_match;
			}

			/* The match proper begins after the ignored head. */
			match_start = consumed + block_idx;
			continue;
		}

		/* Reached via `break' only: the current octet was consumed. */
		block_idx++;
	}

	/*
	 * The whole chain has been processed. This only counts as a match
	 * if no data is left; otherwise control falls into no_match.
	 */
	if (end_of_data())
		goto found_match;

no_match:
	return UINT_MAX;

found_match:
	state->offset = consumed + block_idx;
	return match_start;
}
|  | 259 |  | 
|  | 260 | static struct ts_config *fsm_init(const void *pattern, unsigned int len, | 
| Al Viro | dd0fc66 | 2005-10-07 07:46:04 +0100 | [diff] [blame] | 261 | gfp_t gfp_mask) | 
| Thomas Graf | 6408f79 | 2005-06-23 20:59:16 -0700 | [diff] [blame] | 262 | { | 
|  | 263 | int i, err = -EINVAL; | 
|  | 264 | struct ts_config *conf; | 
|  | 265 | struct ts_fsm *fsm; | 
|  | 266 | struct ts_fsm_token *tokens = (struct ts_fsm_token *) pattern; | 
|  | 267 | unsigned int ntokens = len / sizeof(*tokens); | 
|  | 268 | size_t priv_size = sizeof(*fsm) + len; | 
|  | 269 |  | 
|  | 270 | if (len  % sizeof(struct ts_fsm_token) || ntokens < 1) | 
|  | 271 | goto errout; | 
|  | 272 |  | 
|  | 273 | for (i = 0; i < ntokens; i++) { | 
|  | 274 | struct ts_fsm_token *t = &tokens[i]; | 
|  | 275 |  | 
|  | 276 | if (t->type > TS_FSM_TYPE_MAX || t->recur > TS_FSM_RECUR_MAX) | 
|  | 277 | goto errout; | 
|  | 278 |  | 
|  | 279 | if (t->recur == TS_FSM_HEAD_IGNORE && | 
|  | 280 | (i != 0 || i == (ntokens - 1))) | 
|  | 281 | goto errout; | 
|  | 282 | } | 
|  | 283 |  | 
|  | 284 | conf = alloc_ts_config(priv_size, gfp_mask); | 
|  | 285 | if (IS_ERR(conf)) | 
|  | 286 | return conf; | 
|  | 287 |  | 
|  | 288 | fsm = ts_config_priv(conf); | 
|  | 289 | fsm->ntokens = ntokens; | 
|  | 290 | memcpy(fsm->tokens, pattern, len); | 
|  | 291 |  | 
|  | 292 | for (i = 0; i < fsm->ntokens; i++) { | 
|  | 293 | struct ts_fsm_token *t = &fsm->tokens[i]; | 
|  | 294 | t->type = token_map[t->type]; | 
|  | 295 | } | 
|  | 296 |  | 
|  | 297 | return conf; | 
|  | 298 |  | 
|  | 299 | errout: | 
|  | 300 | return ERR_PTR(err); | 
|  | 301 | } | 
|  | 302 |  | 
|  | 303 | static void *fsm_get_pattern(struct ts_config *conf) | 
|  | 304 | { | 
|  | 305 | struct ts_fsm *fsm = ts_config_priv(conf); | 
|  | 306 | return fsm->tokens; | 
|  | 307 | } | 
|  | 308 |  | 
|  | 309 | static unsigned int fsm_get_pattern_len(struct ts_config *conf) | 
|  | 310 | { | 
|  | 311 | struct ts_fsm *fsm = ts_config_priv(conf); | 
|  | 312 | return fsm->ntokens * sizeof(struct ts_fsm_token); | 
|  | 313 | } | 
|  | 314 |  | 
|  | 315 | static struct ts_ops fsm_ops = { | 
|  | 316 | .name		  = "fsm", | 
|  | 317 | .find		  = fsm_find, | 
|  | 318 | .init		  = fsm_init, | 
|  | 319 | .get_pattern	  = fsm_get_pattern, | 
|  | 320 | .get_pattern_len  = fsm_get_pattern_len, | 
|  | 321 | .owner		  = THIS_MODULE, | 
|  | 322 | .list		  = LIST_HEAD_INIT(fsm_ops.list) | 
|  | 323 | }; | 
|  | 324 |  | 
|  | 325 | static int __init init_fsm(void) | 
|  | 326 | { | 
|  | 327 | return textsearch_register(&fsm_ops); | 
|  | 328 | } | 
|  | 329 |  | 
|  | 330 | static void __exit exit_fsm(void) | 
|  | 331 | { | 
|  | 332 | textsearch_unregister(&fsm_ops); | 
|  | 333 | } | 
|  | 334 |  | 
|  | 335 | MODULE_LICENSE("GPL"); | 
|  | 336 |  | 
|  | 337 | module_init(init_fsm); | 
|  | 338 | module_exit(exit_fsm); |