| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 1 | /* Generate assembler source containing symbol information | 
|  | 2 | * | 
|  | 3 | * Copyright 2002       by Kai Germaschewski | 
|  | 4 | * | 
|  | 5 | * This software may be used and distributed according to the terms | 
|  | 6 | * of the GNU General Public License, incorporated herein by reference. | 
|  | 7 | * | 
|  | 8 | * Usage: nm -n vmlinux | scripts/kallsyms [--all-symbols] > symbols.S | 
|  | 9 | * | 
|  | 10 | * ChangeLog: | 
|  | 11 | * | 
|  | 12 | * (25/Aug/2004) Paulo Marques <pmarques@grupopie.com> | 
|  | 13 | *      Changed the compression method from stem compression to "table lookup" | 
|  | 14 | *      compression | 
|  | 15 | * | 
|  | 16 | *      Table compression uses all the unused char codes on the symbols and | 
|  | 17 | *  maps these to the most used substrings (tokens). For instance, it might | 
|  | 18 | *  map char code 0xF7 to represent "write_" and then in every symbol where | 
|  | 19 | *  "write_" appears it can be replaced by 0xF7, saving 5 bytes. | 
|  | 20 | *      The used codes themselves are also placed in the table so that the | 
|  | 21 | *  decompresion can work without "special cases". | 
|  | 22 | *      Applied to kernel symbols, this usually produces a compression ratio | 
|  | 23 | *  of about 50%. | 
|  | 24 | * | 
|  | 25 | */ | 
|  | 26 |  | 
|  | 27 | #include <stdio.h> | 
|  | 28 | #include <stdlib.h> | 
|  | 29 | #include <string.h> | 
|  | 30 | #include <ctype.h> | 
|  | 31 |  | 
|  | 32 | /* maximum token length used. It doesn't pay to increase it a lot, because | 
|  | 33 | * very long substrings probably don't repeat themselves too often. */ | 
|  | 34 | #define MAX_TOK_SIZE		11 | 
|  | 35 | #define KSYM_NAME_LEN		127 | 
|  | 36 |  | 
|  | 37 | /* we use only a subset of the complete symbol table to gather the token count, | 
|  | 38 | * to speed up compression, at the expense of a little compression ratio */ | 
|  | 39 | #define WORKING_SET		1024 | 
|  | 40 |  | 
|  | 41 | /* first find the best token only on the list of tokens that would profit more | 
|  | 42 | * than GOOD_BAD_THRESHOLD. Only if this list is empty go to the "bad" list. | 
|  | 43 | * Increasing this value will put less tokens on the "good" list, so the search | 
|  | 44 | * is faster. However, if the good list runs out of tokens, we must painfully | 
|  | 45 | * search the bad list. */ | 
|  | 46 | #define GOOD_BAD_THRESHOLD	10 | 
|  | 47 |  | 
|  | 48 | /* token hash parameters */ | 
|  | 49 | #define HASH_BITS		18 | 
|  | 50 | #define HASH_TABLE_SIZE		(1 << HASH_BITS) | 
|  | 51 | #define HASH_MASK		(HASH_TABLE_SIZE - 1) | 
|  | 52 | #define HASH_BASE_OFFSET	2166136261U | 
|  | 53 | #define HASH_FOLD(a)		((a)&(HASH_MASK)) | 
|  | 54 |  | 
|  | 55 | /* flags to mark symbols */ | 
|  | 56 | #define SYM_FLAG_VALID		1 | 
|  | 57 | #define SYM_FLAG_SAMPLED	2 | 
|  | 58 |  | 
|  | 59 | struct sym_entry { | 
|  | 60 | unsigned long long addr; | 
|  | 61 | char type; | 
|  | 62 | unsigned char flags; | 
|  | 63 | unsigned char len; | 
|  | 64 | unsigned char *sym; | 
|  | 65 | }; | 
|  | 66 |  | 
|  | 67 |  | 
|  | 68 | static struct sym_entry *table; | 
|  | 69 | static int size, cnt; | 
| David Woodhouse | 075d6eb | 2005-05-05 16:15:09 -0700 | [diff] [blame] | 70 | static unsigned long long _stext, _etext, _sinittext, _einittext, _sextratext, _eextratext; | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 71 | static int all_symbols = 0; | 
| Yoshinori Sato | 41f11a4 | 2005-05-01 08:59:06 -0700 | [diff] [blame] | 72 | static char symbol_prefix_char = '\0'; | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 73 |  | 
|  | 74 | struct token { | 
|  | 75 | unsigned char data[MAX_TOK_SIZE]; | 
|  | 76 | unsigned char len; | 
|  | 77 | /* profit: the number of bytes that could be saved by inserting this | 
|  | 78 | * token into the table */ | 
|  | 79 | int profit; | 
|  | 80 | struct token *next;	/* next token on the hash list */ | 
|  | 81 | struct token *right;	/* next token on the good/bad list */ | 
|  | 82 | struct token *left;    /* previous token on the good/bad list */ | 
|  | 83 | struct token *smaller; /* token that is less one letter than this one */ | 
|  | 84 | }; | 
|  | 85 |  | 
|  | 86 | struct token bad_head, good_head; | 
|  | 87 | struct token *hash_table[HASH_TABLE_SIZE]; | 
|  | 88 |  | 
|  | 89 | /* the table that holds the result of the compression */ | 
|  | 90 | unsigned char best_table[256][MAX_TOK_SIZE+1]; | 
|  | 91 | unsigned char best_table_len[256]; | 
|  | 92 |  | 
|  | 93 |  | 
|  | 94 | static void | 
|  | 95 | usage(void) | 
|  | 96 | { | 
| Yoshinori Sato | 41f11a4 | 2005-05-01 08:59:06 -0700 | [diff] [blame] | 97 | fprintf(stderr, "Usage: kallsyms [--all-symbols] [--symbol-prefix=<prefix char>] < in.map > out.S\n"); | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 98 | exit(1); | 
|  | 99 | } | 
|  | 100 |  | 
|  | 101 | /* | 
|  | 102 | * This ignores the intensely annoying "mapping symbols" found | 
|  | 103 | * in ARM ELF files: $a, $t and $d. | 
|  | 104 | */ | 
|  | 105 | static inline int | 
|  | 106 | is_arm_mapping_symbol(const char *str) | 
|  | 107 | { | 
|  | 108 | return str[0] == '$' && strchr("atd", str[1]) | 
|  | 109 | && (str[2] == '\0' || str[2] == '.'); | 
|  | 110 | } | 
|  | 111 |  | 
|  | 112 | static int | 
|  | 113 | read_symbol(FILE *in, struct sym_entry *s) | 
|  | 114 | { | 
|  | 115 | char str[500]; | 
| Yoshinori Sato | 41f11a4 | 2005-05-01 08:59:06 -0700 | [diff] [blame] | 116 | char *sym; | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 117 | int rc; | 
|  | 118 |  | 
|  | 119 | rc = fscanf(in, "%llx %c %499s\n", &s->addr, &s->type, str); | 
|  | 120 | if (rc != 3) { | 
|  | 121 | if (rc != EOF) { | 
|  | 122 | /* skip line */ | 
|  | 123 | fgets(str, 500, in); | 
|  | 124 | } | 
|  | 125 | return -1; | 
|  | 126 | } | 
|  | 127 |  | 
| Yoshinori Sato | 41f11a4 | 2005-05-01 08:59:06 -0700 | [diff] [blame] | 128 | sym = str; | 
|  | 129 | /* skip prefix char */ | 
|  | 130 | if (symbol_prefix_char && str[0] == symbol_prefix_char) | 
|  | 131 | sym++; | 
|  | 132 |  | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 133 | /* Ignore most absolute/undefined (?) symbols. */ | 
| Yoshinori Sato | 41f11a4 | 2005-05-01 08:59:06 -0700 | [diff] [blame] | 134 | if (strcmp(sym, "_stext") == 0) | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 135 | _stext = s->addr; | 
| Yoshinori Sato | 41f11a4 | 2005-05-01 08:59:06 -0700 | [diff] [blame] | 136 | else if (strcmp(sym, "_etext") == 0) | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 137 | _etext = s->addr; | 
| Yoshinori Sato | 41f11a4 | 2005-05-01 08:59:06 -0700 | [diff] [blame] | 138 | else if (strcmp(sym, "_sinittext") == 0) | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 139 | _sinittext = s->addr; | 
| Yoshinori Sato | 41f11a4 | 2005-05-01 08:59:06 -0700 | [diff] [blame] | 140 | else if (strcmp(sym, "_einittext") == 0) | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 141 | _einittext = s->addr; | 
| David Woodhouse | 075d6eb | 2005-05-05 16:15:09 -0700 | [diff] [blame] | 142 | else if (strcmp(sym, "_sextratext") == 0) | 
|  | 143 | _sextratext = s->addr; | 
|  | 144 | else if (strcmp(sym, "_eextratext") == 0) | 
|  | 145 | _eextratext = s->addr; | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 146 | else if (toupper(s->type) == 'A') | 
|  | 147 | { | 
|  | 148 | /* Keep these useful absolute symbols */ | 
| Yoshinori Sato | 41f11a4 | 2005-05-01 08:59:06 -0700 | [diff] [blame] | 149 | if (strcmp(sym, "__kernel_syscall_via_break") && | 
|  | 150 | strcmp(sym, "__kernel_syscall_via_epc") && | 
|  | 151 | strcmp(sym, "__kernel_sigtramp") && | 
|  | 152 | strcmp(sym, "__gp")) | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 153 | return -1; | 
|  | 154 |  | 
|  | 155 | } | 
|  | 156 | else if (toupper(s->type) == 'U' || | 
| Yoshinori Sato | 41f11a4 | 2005-05-01 08:59:06 -0700 | [diff] [blame] | 157 | is_arm_mapping_symbol(sym)) | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 158 | return -1; | 
|  | 159 |  | 
|  | 160 | /* include the type field in the symbol name, so that it gets | 
|  | 161 | * compressed together */ | 
|  | 162 | s->len = strlen(str) + 1; | 
|  | 163 | s->sym = (char *) malloc(s->len + 1); | 
|  | 164 | strcpy(s->sym + 1, str); | 
|  | 165 | s->sym[0] = s->type; | 
|  | 166 |  | 
|  | 167 | return 0; | 
|  | 168 | } | 
|  | 169 |  | 
|  | 170 | static int | 
|  | 171 | symbol_valid(struct sym_entry *s) | 
|  | 172 | { | 
|  | 173 | /* Symbols which vary between passes.  Passes 1 and 2 must have | 
|  | 174 | * identical symbol lists.  The kallsyms_* symbols below are only added | 
|  | 175 | * after pass 1, they would be included in pass 2 when --all-symbols is | 
|  | 176 | * specified so exclude them to get a stable symbol list. | 
|  | 177 | */ | 
|  | 178 | static char *special_symbols[] = { | 
|  | 179 | "kallsyms_addresses", | 
|  | 180 | "kallsyms_num_syms", | 
|  | 181 | "kallsyms_names", | 
|  | 182 | "kallsyms_markers", | 
|  | 183 | "kallsyms_token_table", | 
|  | 184 | "kallsyms_token_index", | 
|  | 185 |  | 
|  | 186 | /* Exclude linker generated symbols which vary between passes */ | 
|  | 187 | "_SDA_BASE_",		/* ppc */ | 
|  | 188 | "_SDA2_BASE_",		/* ppc */ | 
|  | 189 | NULL }; | 
|  | 190 | int i; | 
| Yoshinori Sato | 41f11a4 | 2005-05-01 08:59:06 -0700 | [diff] [blame] | 191 | int offset = 1; | 
|  | 192 |  | 
|  | 193 | /* skip prefix char */ | 
|  | 194 | if (symbol_prefix_char && *(s->sym + 1) == symbol_prefix_char) | 
|  | 195 | offset++; | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 196 |  | 
|  | 197 | /* if --all-symbols is not specified, then symbols outside the text | 
|  | 198 | * and inittext sections are discarded */ | 
|  | 199 | if (!all_symbols) { | 
|  | 200 | if ((s->addr < _stext || s->addr > _etext) | 
| David Woodhouse | 075d6eb | 2005-05-05 16:15:09 -0700 | [diff] [blame] | 201 | && (s->addr < _sinittext || s->addr > _einittext) | 
|  | 202 | && (s->addr < _sextratext || s->addr > _eextratext)) | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 203 | return 0; | 
|  | 204 | /* Corner case.  Discard any symbols with the same value as | 
| David Woodhouse | 075d6eb | 2005-05-05 16:15:09 -0700 | [diff] [blame] | 205 | * _etext _einittext or _eextratext; they can move between pass | 
|  | 206 | * 1 and 2 when the kallsyms data are added.  If these symbols | 
|  | 207 | * move then they may get dropped in pass 2, which breaks the | 
|  | 208 | * kallsyms rules. | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 209 | */ | 
| Yoshinori Sato | 41f11a4 | 2005-05-01 08:59:06 -0700 | [diff] [blame] | 210 | if ((s->addr == _etext && strcmp(s->sym + offset, "_etext")) || | 
| David Woodhouse | 075d6eb | 2005-05-05 16:15:09 -0700 | [diff] [blame] | 211 | (s->addr == _einittext && strcmp(s->sym + offset, "_einittext")) || | 
|  | 212 | (s->addr == _eextratext && strcmp(s->sym + offset, "_eextratext"))) | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 213 | return 0; | 
|  | 214 | } | 
|  | 215 |  | 
|  | 216 | /* Exclude symbols which vary between passes. */ | 
| Yoshinori Sato | 41f11a4 | 2005-05-01 08:59:06 -0700 | [diff] [blame] | 217 | if (strstr(s->sym + offset, "_compiled.")) | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 218 | return 0; | 
|  | 219 |  | 
|  | 220 | for (i = 0; special_symbols[i]; i++) | 
| Yoshinori Sato | 41f11a4 | 2005-05-01 08:59:06 -0700 | [diff] [blame] | 221 | if( strcmp(s->sym + offset, special_symbols[i]) == 0 ) | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 222 | return 0; | 
|  | 223 |  | 
|  | 224 | return 1; | 
|  | 225 | } | 
|  | 226 |  | 
|  | 227 | static void | 
|  | 228 | read_map(FILE *in) | 
|  | 229 | { | 
|  | 230 | while (!feof(in)) { | 
|  | 231 | if (cnt >= size) { | 
|  | 232 | size += 10000; | 
|  | 233 | table = realloc(table, sizeof(*table) * size); | 
|  | 234 | if (!table) { | 
|  | 235 | fprintf(stderr, "out of memory\n"); | 
|  | 236 | exit (1); | 
|  | 237 | } | 
|  | 238 | } | 
|  | 239 | if (read_symbol(in, &table[cnt]) == 0) | 
|  | 240 | cnt++; | 
|  | 241 | } | 
|  | 242 | } | 
|  | 243 |  | 
|  | 244 | static void output_label(char *label) | 
|  | 245 | { | 
| Yoshinori Sato | 41f11a4 | 2005-05-01 08:59:06 -0700 | [diff] [blame] | 246 | if (symbol_prefix_char) | 
|  | 247 | printf(".globl %c%s\n", symbol_prefix_char, label); | 
|  | 248 | else | 
|  | 249 | printf(".globl %s\n", label); | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 250 | printf("\tALGN\n"); | 
| Yoshinori Sato | 41f11a4 | 2005-05-01 08:59:06 -0700 | [diff] [blame] | 251 | if (symbol_prefix_char) | 
|  | 252 | printf("%c%s:\n", symbol_prefix_char, label); | 
|  | 253 | else | 
|  | 254 | printf("%s:\n", label); | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 255 | } | 
|  | 256 |  | 
|  | 257 | /* uncompress a compressed symbol. When this function is called, the best table | 
|  | 258 | * might still be compressed itself, so the function needs to be recursive */ | 
|  | 259 | static int expand_symbol(unsigned char *data, int len, char *result) | 
|  | 260 | { | 
|  | 261 | int c, rlen, total=0; | 
|  | 262 |  | 
|  | 263 | while (len) { | 
|  | 264 | c = *data; | 
|  | 265 | /* if the table holds a single char that is the same as the one | 
|  | 266 | * we are looking for, then end the search */ | 
|  | 267 | if (best_table[c][0]==c && best_table_len[c]==1) { | 
|  | 268 | *result++ = c; | 
|  | 269 | total++; | 
|  | 270 | } else { | 
|  | 271 | /* if not, recurse and expand */ | 
|  | 272 | rlen = expand_symbol(best_table[c], best_table_len[c], result); | 
|  | 273 | total += rlen; | 
|  | 274 | result += rlen; | 
|  | 275 | } | 
|  | 276 | data++; | 
|  | 277 | len--; | 
|  | 278 | } | 
|  | 279 | *result=0; | 
|  | 280 |  | 
|  | 281 | return total; | 
|  | 282 | } | 
|  | 283 |  | 
|  | 284 | static void | 
|  | 285 | write_src(void) | 
|  | 286 | { | 
|  | 287 | int i, k, off, valid; | 
|  | 288 | unsigned int best_idx[256]; | 
|  | 289 | unsigned int *markers; | 
|  | 290 | char buf[KSYM_NAME_LEN+1]; | 
|  | 291 |  | 
|  | 292 | printf("#include <asm/types.h>\n"); | 
|  | 293 | printf("#if BITS_PER_LONG == 64\n"); | 
|  | 294 | printf("#define PTR .quad\n"); | 
|  | 295 | printf("#define ALGN .align 8\n"); | 
|  | 296 | printf("#else\n"); | 
|  | 297 | printf("#define PTR .long\n"); | 
|  | 298 | printf("#define ALGN .align 4\n"); | 
|  | 299 | printf("#endif\n"); | 
|  | 300 |  | 
|  | 301 | printf(".data\n"); | 
|  | 302 |  | 
|  | 303 | output_label("kallsyms_addresses"); | 
|  | 304 | valid = 0; | 
|  | 305 | for (i = 0; i < cnt; i++) { | 
|  | 306 | if (table[i].flags & SYM_FLAG_VALID) { | 
|  | 307 | printf("\tPTR\t%#llx\n", table[i].addr); | 
|  | 308 | valid++; | 
|  | 309 | } | 
|  | 310 | } | 
|  | 311 | printf("\n"); | 
|  | 312 |  | 
|  | 313 | output_label("kallsyms_num_syms"); | 
|  | 314 | printf("\tPTR\t%d\n", valid); | 
|  | 315 | printf("\n"); | 
|  | 316 |  | 
|  | 317 | /* table of offset markers, that give the offset in the compressed stream | 
|  | 318 | * every 256 symbols */ | 
|  | 319 | markers = (unsigned int *) malloc(sizeof(unsigned int)*((valid + 255) / 256)); | 
|  | 320 |  | 
|  | 321 | output_label("kallsyms_names"); | 
|  | 322 | valid = 0; | 
|  | 323 | off = 0; | 
|  | 324 | for (i = 0; i < cnt; i++) { | 
|  | 325 |  | 
|  | 326 | if (!table[i].flags & SYM_FLAG_VALID) | 
|  | 327 | continue; | 
|  | 328 |  | 
|  | 329 | if ((valid & 0xFF) == 0) | 
|  | 330 | markers[valid >> 8] = off; | 
|  | 331 |  | 
|  | 332 | printf("\t.byte 0x%02x", table[i].len); | 
|  | 333 | for (k = 0; k < table[i].len; k++) | 
|  | 334 | printf(", 0x%02x", table[i].sym[k]); | 
|  | 335 | printf("\n"); | 
|  | 336 |  | 
|  | 337 | off += table[i].len + 1; | 
|  | 338 | valid++; | 
|  | 339 | } | 
|  | 340 | printf("\n"); | 
|  | 341 |  | 
|  | 342 | output_label("kallsyms_markers"); | 
|  | 343 | for (i = 0; i < ((valid + 255) >> 8); i++) | 
|  | 344 | printf("\tPTR\t%d\n", markers[i]); | 
|  | 345 | printf("\n"); | 
|  | 346 |  | 
|  | 347 | free(markers); | 
|  | 348 |  | 
|  | 349 | output_label("kallsyms_token_table"); | 
|  | 350 | off = 0; | 
|  | 351 | for (i = 0; i < 256; i++) { | 
|  | 352 | best_idx[i] = off; | 
|  | 353 | expand_symbol(best_table[i],best_table_len[i],buf); | 
|  | 354 | printf("\t.asciz\t\"%s\"\n", buf); | 
|  | 355 | off += strlen(buf) + 1; | 
|  | 356 | } | 
|  | 357 | printf("\n"); | 
|  | 358 |  | 
|  | 359 | output_label("kallsyms_token_index"); | 
|  | 360 | for (i = 0; i < 256; i++) | 
|  | 361 | printf("\t.short\t%d\n", best_idx[i]); | 
|  | 362 | printf("\n"); | 
|  | 363 | } | 
|  | 364 |  | 
|  | 365 |  | 
|  | 366 | /* table lookup compression functions */ | 
|  | 367 |  | 
|  | 368 | static inline unsigned int rehash_token(unsigned int hash, unsigned char data) | 
|  | 369 | { | 
|  | 370 | return ((hash * 16777619) ^ data); | 
|  | 371 | } | 
|  | 372 |  | 
|  | 373 | static unsigned int hash_token(unsigned char *data, int len) | 
|  | 374 | { | 
|  | 375 | unsigned int hash=HASH_BASE_OFFSET; | 
|  | 376 | int i; | 
|  | 377 |  | 
|  | 378 | for (i = 0; i < len; i++) | 
|  | 379 | hash = rehash_token(hash, data[i]); | 
|  | 380 |  | 
|  | 381 | return HASH_FOLD(hash); | 
|  | 382 | } | 
|  | 383 |  | 
|  | 384 | /* find a token given its data and hash value */ | 
|  | 385 | static struct token *find_token_hash(unsigned char *data, int len, unsigned int hash) | 
|  | 386 | { | 
|  | 387 | struct token *ptr; | 
|  | 388 |  | 
|  | 389 | ptr = hash_table[hash]; | 
|  | 390 |  | 
|  | 391 | while (ptr) { | 
|  | 392 | if ((ptr->len == len) && (memcmp(ptr->data, data, len) == 0)) | 
|  | 393 | return ptr; | 
|  | 394 | ptr=ptr->next; | 
|  | 395 | } | 
|  | 396 |  | 
|  | 397 | return NULL; | 
|  | 398 | } | 
|  | 399 |  | 
|  | 400 | static inline void insert_token_in_group(struct token *head, struct token *ptr) | 
|  | 401 | { | 
|  | 402 | ptr->right = head->right; | 
|  | 403 | ptr->right->left = ptr; | 
|  | 404 | head->right = ptr; | 
|  | 405 | ptr->left = head; | 
|  | 406 | } | 
|  | 407 |  | 
|  | 408 | static inline void remove_token_from_group(struct token *ptr) | 
|  | 409 | { | 
|  | 410 | ptr->left->right = ptr->right; | 
|  | 411 | ptr->right->left = ptr->left; | 
|  | 412 | } | 
|  | 413 |  | 
|  | 414 |  | 
|  | 415 | /* build the counts for all the tokens that start with "data", and have lenghts | 
|  | 416 | * from 2 to "len" */ | 
|  | 417 | static void learn_token(unsigned char *data, int len) | 
|  | 418 | { | 
|  | 419 | struct token *ptr,*last_ptr; | 
|  | 420 | int i, newprofit; | 
|  | 421 | unsigned int hash = HASH_BASE_OFFSET; | 
|  | 422 | unsigned int hashes[MAX_TOK_SIZE + 1]; | 
|  | 423 |  | 
|  | 424 | if (len > MAX_TOK_SIZE) | 
|  | 425 | len = MAX_TOK_SIZE; | 
|  | 426 |  | 
|  | 427 | /* calculate and store the hash values for all the sub-tokens */ | 
|  | 428 | hash = rehash_token(hash, data[0]); | 
|  | 429 | for (i = 2; i <= len; i++) { | 
|  | 430 | hash = rehash_token(hash, data[i-1]); | 
|  | 431 | hashes[i] = HASH_FOLD(hash); | 
|  | 432 | } | 
|  | 433 |  | 
|  | 434 | last_ptr = NULL; | 
|  | 435 | ptr = NULL; | 
|  | 436 |  | 
|  | 437 | for (i = len; i >= 2; i--) { | 
|  | 438 | hash = hashes[i]; | 
|  | 439 |  | 
|  | 440 | if (!ptr) ptr = find_token_hash(data, i, hash); | 
|  | 441 |  | 
|  | 442 | if (!ptr) { | 
|  | 443 | /* create a new token entry */ | 
|  | 444 | ptr = (struct token *) malloc(sizeof(*ptr)); | 
|  | 445 |  | 
|  | 446 | memcpy(ptr->data, data, i); | 
|  | 447 | ptr->len = i; | 
|  | 448 |  | 
|  | 449 | /* when we create an entry, it's profit is 0 because | 
|  | 450 | * we also take into account the size of the token on | 
|  | 451 | * the compressed table. We then subtract GOOD_BAD_THRESHOLD | 
|  | 452 | * so that the test to see if this token belongs to | 
|  | 453 | * the good or bad list, is a comparison to zero */ | 
|  | 454 | ptr->profit = -GOOD_BAD_THRESHOLD; | 
|  | 455 |  | 
|  | 456 | ptr->next = hash_table[hash]; | 
|  | 457 | hash_table[hash] = ptr; | 
|  | 458 |  | 
|  | 459 | insert_token_in_group(&bad_head, ptr); | 
|  | 460 |  | 
|  | 461 | ptr->smaller = NULL; | 
|  | 462 | } else { | 
|  | 463 | newprofit = ptr->profit + (ptr->len - 1); | 
|  | 464 | /* check to see if this token needs to be moved to a | 
|  | 465 | * different list */ | 
|  | 466 | if((ptr->profit < 0) && (newprofit >= 0)) { | 
|  | 467 | remove_token_from_group(ptr); | 
|  | 468 | insert_token_in_group(&good_head,ptr); | 
|  | 469 | } | 
|  | 470 | ptr->profit = newprofit; | 
|  | 471 | } | 
|  | 472 |  | 
|  | 473 | if (last_ptr) last_ptr->smaller = ptr; | 
|  | 474 | last_ptr = ptr; | 
|  | 475 |  | 
|  | 476 | ptr = ptr->smaller; | 
|  | 477 | } | 
|  | 478 | } | 
|  | 479 |  | 
|  | 480 | /* decrease the counts for all the tokens that start with "data", and have lenghts | 
|  | 481 | * from 2 to "len". This function is much simpler than learn_token because we have | 
|  | 482 | * more guarantees (tho tokens exist, the ->smaller pointer is set, etc.) | 
|  | 483 | * The two separate functions exist only because of compression performance */ | 
|  | 484 | static void forget_token(unsigned char *data, int len) | 
|  | 485 | { | 
|  | 486 | struct token *ptr; | 
|  | 487 | int i, newprofit; | 
|  | 488 | unsigned int hash=0; | 
|  | 489 |  | 
|  | 490 | if (len > MAX_TOK_SIZE) len = MAX_TOK_SIZE; | 
|  | 491 |  | 
|  | 492 | hash = hash_token(data, len); | 
|  | 493 | ptr = find_token_hash(data, len, hash); | 
|  | 494 |  | 
|  | 495 | for (i = len; i >= 2; i--) { | 
|  | 496 |  | 
|  | 497 | newprofit = ptr->profit - (ptr->len - 1); | 
|  | 498 | if ((ptr->profit >= 0) && (newprofit < 0)) { | 
|  | 499 | remove_token_from_group(ptr); | 
|  | 500 | insert_token_in_group(&bad_head, ptr); | 
|  | 501 | } | 
|  | 502 | ptr->profit=newprofit; | 
|  | 503 |  | 
|  | 504 | ptr=ptr->smaller; | 
|  | 505 | } | 
|  | 506 | } | 
|  | 507 |  | 
|  | 508 | /* count all the possible tokens in a symbol */ | 
|  | 509 | static void learn_symbol(unsigned char *symbol, int len) | 
|  | 510 | { | 
|  | 511 | int i; | 
|  | 512 |  | 
|  | 513 | for (i = 0; i < len - 1; i++) | 
|  | 514 | learn_token(symbol + i, len - i); | 
|  | 515 | } | 
|  | 516 |  | 
|  | 517 | /* decrease the count for all the possible tokens in a symbol */ | 
|  | 518 | static void forget_symbol(unsigned char *symbol, int len) | 
|  | 519 | { | 
|  | 520 | int i; | 
|  | 521 |  | 
|  | 522 | for (i = 0; i < len - 1; i++) | 
|  | 523 | forget_token(symbol + i, len - i); | 
|  | 524 | } | 
|  | 525 |  | 
|  | 526 | /* set all the symbol flags and do the initial token count */ | 
|  | 527 | static void build_initial_tok_table(void) | 
|  | 528 | { | 
|  | 529 | int i, use_it, valid; | 
|  | 530 |  | 
|  | 531 | valid = 0; | 
|  | 532 | for (i = 0; i < cnt; i++) { | 
|  | 533 | table[i].flags = 0; | 
|  | 534 | if ( symbol_valid(&table[i]) ) { | 
|  | 535 | table[i].flags |= SYM_FLAG_VALID; | 
|  | 536 | valid++; | 
|  | 537 | } | 
|  | 538 | } | 
|  | 539 |  | 
|  | 540 | use_it = 0; | 
|  | 541 | for (i = 0; i < cnt; i++) { | 
|  | 542 |  | 
|  | 543 | /* subsample the available symbols. This method is almost like | 
|  | 544 | * a Bresenham's algorithm to get uniformly distributed samples | 
|  | 545 | * across the symbol table */ | 
|  | 546 | if (table[i].flags & SYM_FLAG_VALID) { | 
|  | 547 |  | 
|  | 548 | use_it += WORKING_SET; | 
|  | 549 |  | 
|  | 550 | if (use_it >= valid) { | 
|  | 551 | table[i].flags |= SYM_FLAG_SAMPLED; | 
|  | 552 | use_it -= valid; | 
|  | 553 | } | 
|  | 554 | } | 
|  | 555 | if (table[i].flags & SYM_FLAG_SAMPLED) | 
|  | 556 | learn_symbol(table[i].sym, table[i].len); | 
|  | 557 | } | 
|  | 558 | } | 
|  | 559 |  | 
|  | 560 | /* replace a given token in all the valid symbols. Use the sampled symbols | 
|  | 561 | * to update the counts */ | 
|  | 562 | static void compress_symbols(unsigned char *str, int tlen, int idx) | 
|  | 563 | { | 
|  | 564 | int i, len, learn, size; | 
|  | 565 | unsigned char *p; | 
|  | 566 |  | 
|  | 567 | for (i = 0; i < cnt; i++) { | 
|  | 568 |  | 
|  | 569 | if (!(table[i].flags & SYM_FLAG_VALID)) continue; | 
|  | 570 |  | 
|  | 571 | len = table[i].len; | 
|  | 572 | learn = 0; | 
|  | 573 | p = table[i].sym; | 
|  | 574 |  | 
|  | 575 | do { | 
|  | 576 | /* find the token on the symbol */ | 
|  | 577 | p = (unsigned char *) strstr((char *) p, (char *) str); | 
|  | 578 | if (!p) break; | 
|  | 579 |  | 
|  | 580 | if (!learn) { | 
|  | 581 | /* if this symbol was used to count, decrease it */ | 
|  | 582 | if (table[i].flags & SYM_FLAG_SAMPLED) | 
|  | 583 | forget_symbol(table[i].sym, len); | 
|  | 584 | learn = 1; | 
|  | 585 | } | 
|  | 586 |  | 
|  | 587 | *p = idx; | 
|  | 588 | size = (len - (p - table[i].sym)) - tlen + 1; | 
|  | 589 | memmove(p + 1, p + tlen, size); | 
|  | 590 | p++; | 
|  | 591 | len -= tlen - 1; | 
|  | 592 |  | 
|  | 593 | } while (size >= tlen); | 
|  | 594 |  | 
|  | 595 | if(learn) { | 
|  | 596 | table[i].len = len; | 
|  | 597 | /* if this symbol was used to count, learn it again */ | 
|  | 598 | if(table[i].flags & SYM_FLAG_SAMPLED) | 
|  | 599 | learn_symbol(table[i].sym, len); | 
|  | 600 | } | 
|  | 601 | } | 
|  | 602 | } | 
|  | 603 |  | 
|  | 604 | /* search the token with the maximum profit */ | 
|  | 605 | static struct token *find_best_token(void) | 
|  | 606 | { | 
|  | 607 | struct token *ptr,*best,*head; | 
|  | 608 | int bestprofit; | 
|  | 609 |  | 
|  | 610 | bestprofit=-10000; | 
|  | 611 |  | 
|  | 612 | /* failsafe: if the "good" list is empty search from the "bad" list */ | 
|  | 613 | if(good_head.right == &good_head) head = &bad_head; | 
|  | 614 | else head = &good_head; | 
|  | 615 |  | 
|  | 616 | ptr = head->right; | 
|  | 617 | best = NULL; | 
|  | 618 | while (ptr != head) { | 
|  | 619 | if (ptr->profit > bestprofit) { | 
|  | 620 | bestprofit = ptr->profit; | 
|  | 621 | best = ptr; | 
|  | 622 | } | 
|  | 623 | ptr = ptr->right; | 
|  | 624 | } | 
|  | 625 |  | 
|  | 626 | return best; | 
|  | 627 | } | 
|  | 628 |  | 
|  | 629 | /* this is the core of the algorithm: calculate the "best" table */ | 
|  | 630 | static void optimize_result(void) | 
|  | 631 | { | 
|  | 632 | struct token *best; | 
|  | 633 | int i; | 
|  | 634 |  | 
|  | 635 | /* using the '\0' symbol last allows compress_symbols to use standard | 
|  | 636 | * fast string functions */ | 
|  | 637 | for (i = 255; i >= 0; i--) { | 
|  | 638 |  | 
|  | 639 | /* if this table slot is empty (it is not used by an actual | 
|  | 640 | * original char code */ | 
|  | 641 | if (!best_table_len[i]) { | 
|  | 642 |  | 
|  | 643 | /* find the token with the breates profit value */ | 
|  | 644 | best = find_best_token(); | 
|  | 645 |  | 
|  | 646 | /* place it in the "best" table */ | 
|  | 647 | best_table_len[i] = best->len; | 
|  | 648 | memcpy(best_table[i], best->data, best_table_len[i]); | 
|  | 649 | /* zero terminate the token so that we can use strstr | 
|  | 650 | in compress_symbols */ | 
|  | 651 | best_table[i][best_table_len[i]]='\0'; | 
|  | 652 |  | 
|  | 653 | /* replace this token in all the valid symbols */ | 
|  | 654 | compress_symbols(best_table[i], best_table_len[i], i); | 
|  | 655 | } | 
|  | 656 | } | 
|  | 657 | } | 
|  | 658 |  | 
|  | 659 | /* start by placing the symbols that are actually used on the table */ | 
|  | 660 | static void insert_real_symbols_in_table(void) | 
|  | 661 | { | 
|  | 662 | int i, j, c; | 
|  | 663 |  | 
|  | 664 | memset(best_table, 0, sizeof(best_table)); | 
|  | 665 | memset(best_table_len, 0, sizeof(best_table_len)); | 
|  | 666 |  | 
|  | 667 | for (i = 0; i < cnt; i++) { | 
|  | 668 | if (table[i].flags & SYM_FLAG_VALID) { | 
|  | 669 | for (j = 0; j < table[i].len; j++) { | 
|  | 670 | c = table[i].sym[j]; | 
|  | 671 | best_table[c][0]=c; | 
|  | 672 | best_table_len[c]=1; | 
|  | 673 | } | 
|  | 674 | } | 
|  | 675 | } | 
|  | 676 | } | 
|  | 677 |  | 
|  | 678 | static void optimize_token_table(void) | 
|  | 679 | { | 
|  | 680 | memset(hash_table, 0, sizeof(hash_table)); | 
|  | 681 |  | 
|  | 682 | good_head.left = &good_head; | 
|  | 683 | good_head.right = &good_head; | 
|  | 684 |  | 
|  | 685 | bad_head.left = &bad_head; | 
|  | 686 | bad_head.right = &bad_head; | 
|  | 687 |  | 
|  | 688 | build_initial_tok_table(); | 
|  | 689 |  | 
|  | 690 | insert_real_symbols_in_table(); | 
|  | 691 |  | 
| Yoshinori Sato | 41f11a4 | 2005-05-01 08:59:06 -0700 | [diff] [blame] | 692 | /* When valid symbol is not registered, exit to error */ | 
|  | 693 | if (good_head.left == good_head.right && | 
|  | 694 | bad_head.left == bad_head.right) { | 
|  | 695 | fprintf(stderr, "No valid symbol.\n"); | 
|  | 696 | exit(1); | 
|  | 697 | } | 
|  | 698 |  | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 699 | optimize_result(); | 
|  | 700 | } | 
|  | 701 |  | 
|  | 702 |  | 
|  | 703 | int | 
|  | 704 | main(int argc, char **argv) | 
|  | 705 | { | 
| Yoshinori Sato | 41f11a4 | 2005-05-01 08:59:06 -0700 | [diff] [blame] | 706 | if (argc >= 2) { | 
|  | 707 | int i; | 
|  | 708 | for (i = 1; i < argc; i++) { | 
|  | 709 | if(strcmp(argv[i], "--all-symbols") == 0) | 
|  | 710 | all_symbols = 1; | 
|  | 711 | else if (strncmp(argv[i], "--symbol-prefix=", 16) == 0) { | 
|  | 712 | char *p = &argv[i][16]; | 
|  | 713 | /* skip quote */ | 
|  | 714 | if ((*p == '"' && *(p+2) == '"') || (*p == '\'' && *(p+2) == '\'')) | 
|  | 715 | p++; | 
|  | 716 | symbol_prefix_char = *p; | 
|  | 717 | } else | 
|  | 718 | usage(); | 
|  | 719 | } | 
|  | 720 | } else if (argc != 1) | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 721 | usage(); | 
|  | 722 |  | 
|  | 723 | read_map(stdin); | 
|  | 724 | optimize_token_table(); | 
|  | 725 | write_src(); | 
|  | 726 |  | 
|  | 727 | return 0; | 
|  | 728 | } |