我在我的探查器中闲逛一段时间试图弄清楚如何加速一个常见的日志解析器,这个解析器在日期解析时遇到瓶颈,我尝试了各种算法来加快速度.
我尝试过的对我来说最快的东西也是迄今为止最具可读性,但可能是非标准的C.
这在GCC,icc和我真正古老而挑剔的SGI编译器中运行良好.因为它是一个非常易读的优化,它不是我想要的吗?
static int parseMonth(const char *input) { int rv=-1; int inputInt=0; int i=0; for(i=0; i<4 && input[i]; i++) { inputInt = (inputInt << 8) | input[i]; } switch(inputInt) { case 'Jan/': rv=0; break; case 'Feb/': rv=1; break; case 'Mar/': rv=2; break; case 'Apr/': rv=3; break; case 'May/': rv=4; break; case 'Jun/': rv=5; break; case 'Jul/': rv=6; break; case 'Aug/': rv=7; break; case 'Sep/': rv=8; break; case 'Oct/': rv=9; break; case 'Nov/': rv=10; break; case 'Dec/': rv=11; break; } return rv; }
Jonathan Lef.. 22
Solaris 10 - SPARC - SUN编译器.
测试代码:
#includestatic int parseMonth(const char *input) { int rv=-1; int inputInt=0; int i=0; for(i=0; i<4 && input[i]; i++) { inputInt = (inputInt << 8) | input[i]; } switch(inputInt) { case 'Jan/': rv=0; break; case 'Feb/': rv=1; break; case 'Mar/': rv=2; break; case 'Apr/': rv=3; break; case 'May/': rv=4; break; case 'Jun/': rv=5; break; case 'Jul/': rv=6; break; case 'Aug/': rv=7; break; case 'Sep/': rv=8; break; case 'Oct/': rv=9; break; case 'Nov/': rv=10; break; case 'Dec/': rv=11; break; } return rv; } static const struct { char *data; int result; } test_case[] = { { "Jan/", 0 }, { "Feb/", 1 }, { "Mar/", 2 }, { "Apr/", 3 }, { "May/", 4 }, { "Jun/", 5 }, { "Jul/", 6 }, { "Aug/", 7 }, { "Sep/", 8 }, { "Oct/", 9 }, { "Nov/", 10 }, { "Dec/", 11 }, { "aJ/n", -1 }, }; #define DIM(x) (sizeof(x)/sizeof(*(x))) int main(void) { size_t i; int result; for (i = 0; i < DIM(test_case); i++) { result = parseMonth(test_case[i].data); if (result != test_case[i].result) printf("!! FAIL !! %s (got %d, wanted %d)\n", test_case[i].data, result, test_case[i].result); } return(0); }
结果(GCC 3.4.2和Sun):
$ gcc -O xx.c -o xx xx.c:14:14: warning: multi-character character constant xx.c:15:14: warning: multi-character character constant xx.c:16:14: warning: multi-character character constant xx.c:17:14: warning: multi-character character constant xx.c:18:14: warning: multi-character character constant xx.c:19:14: warning: multi-character character constant xx.c:20:14: warning: multi-character character constant xx.c:21:14: warning: multi-character character constant xx.c:22:14: warning: multi-character character constant xx.c:23:14: warning: multi-character character constant xx.c:24:14: warning: multi-character character constant xx.c:25:14: warning: multi-character character constant $ ./xx $ cc -o xx xx.c $ ./xx !! FAIL !! Jan/ (got -1, wanted 0) !! FAIL !! Feb/ (got -1, wanted 1) !! FAIL !! Mar/ (got -1, wanted 2) !! FAIL !! Apr/ (got -1, wanted 3) !! FAIL !! May/ (got -1, wanted 4) !! FAIL !! Jun/ (got -1, wanted 5) !! FAIL !! Jul/ (got -1, wanted 6) !! FAIL !! Aug/ (got -1, wanted 7) !! FAIL !! Sep/ (got -1, wanted 8) !! FAIL !! Oct/ (got -1, wanted 9) !! FAIL !! Nov/ (got -1, wanted 10) !! FAIL !! Dec/ (got -1, wanted 11) $
请注意,最后一个测试用例仍然通过 - 也就是说,它生成了-1.
这是一个修订的 - 更详细的 - parseMonth()版本,它在GCC和Sun C编译器下的工作原理相同:
#include/* MONTH_CODE("Jan/") does not reduce to an integer constant */ #define MONTH_CODE(x) ((((((x[0]<<8)|x[1])<<8)|x[2])<<8)|x[3]) #define MONTH_JAN (((((('J'<<8)|'a')<<8)|'n')<<8)|'/') #define MONTH_FEB (((((('F'<<8)|'e')<<8)|'b')<<8)|'/') #define MONTH_MAR (((((('M'<<8)|'a')<<8)|'r')<<8)|'/') #define MONTH_APR (((((('A'<<8)|'p')<<8)|'r')<<8)|'/') #define MONTH_MAY (((((('M'<<8)|'a')<<8)|'y')<<8)|'/') #define MONTH_JUN (((((('J'<<8)|'u')<<8)|'n')<<8)|'/') #define MONTH_JUL (((((('J'<<8)|'u')<<8)|'l')<<8)|'/') #define MONTH_AUG (((((('A'<<8)|'u')<<8)|'g')<<8)|'/') #define MONTH_SEP (((((('S'<<8)|'e')<<8)|'p')<<8)|'/') #define MONTH_OCT (((((('O'<<8)|'c')<<8)|'t')<<8)|'/') #define MONTH_NOV (((((('N'<<8)|'o')<<8)|'v')<<8)|'/') #define MONTH_DEC (((((('D'<<8)|'e')<<8)|'c')<<8)|'/') static int parseMonth(const char *input) { int rv=-1; int inputInt=0; int i=0; for(i=0; i<4 && input[i]; i++) { inputInt = (inputInt << 8) | input[i]; } switch(inputInt) { case MONTH_JAN: rv=0; break; case MONTH_FEB: rv=1; break; case MONTH_MAR: rv=2; break; case MONTH_APR: rv=3; break; case MONTH_MAY: rv=4; break; case MONTH_JUN: rv=5; break; case MONTH_JUL: rv=6; break; case MONTH_AUG: rv=7; break; case MONTH_SEP: rv=8; break; case MONTH_OCT: rv=9; break; case MONTH_NOV: rv=10; break; case MONTH_DEC: rv=11; break; } return rv; } static const struct { char *data; int result; } test_case[] = { { "Jan/", 0 }, { "Feb/", 1 }, { "Mar/", 2 }, { "Apr/", 3 }, { "May/", 4 }, { "Jun/", 5 }, { "Jul/", 6 }, { "Aug/", 7 }, { "Sep/", 8 }, { "Oct/", 9 }, { "Nov/", 10 }, { "Dec/", 11 }, { "aJ/n", -1 }, { "/naJ", -1 }, }; #define DIM(x) (sizeof(x)/sizeof(*(x))) int main(void) { size_t i; int result; for (i = 0; i < DIM(test_case); i++) { result = parseMonth(test_case[i].data); if (result != test_case[i].result) printf("!! FAIL !! %s (got %d, wanted %d)\n", test_case[i].data, result, test_case[i].result); } return(0); }
我想使用MONTH_CODE(),但编译器没有合作.
Solaris 10 - SPARC - SUN编译器.
测试代码:
#includestatic int parseMonth(const char *input) { int rv=-1; int inputInt=0; int i=0; for(i=0; i<4 && input[i]; i++) { inputInt = (inputInt << 8) | input[i]; } switch(inputInt) { case 'Jan/': rv=0; break; case 'Feb/': rv=1; break; case 'Mar/': rv=2; break; case 'Apr/': rv=3; break; case 'May/': rv=4; break; case 'Jun/': rv=5; break; case 'Jul/': rv=6; break; case 'Aug/': rv=7; break; case 'Sep/': rv=8; break; case 'Oct/': rv=9; break; case 'Nov/': rv=10; break; case 'Dec/': rv=11; break; } return rv; } static const struct { char *data; int result; } test_case[] = { { "Jan/", 0 }, { "Feb/", 1 }, { "Mar/", 2 }, { "Apr/", 3 }, { "May/", 4 }, { "Jun/", 5 }, { "Jul/", 6 }, { "Aug/", 7 }, { "Sep/", 8 }, { "Oct/", 9 }, { "Nov/", 10 }, { "Dec/", 11 }, { "aJ/n", -1 }, }; #define DIM(x) (sizeof(x)/sizeof(*(x))) int main(void) { size_t i; int result; for (i = 0; i < DIM(test_case); i++) { result = parseMonth(test_case[i].data); if (result != test_case[i].result) printf("!! FAIL !! %s (got %d, wanted %d)\n", test_case[i].data, result, test_case[i].result); } return(0); }
结果(GCC 3.4.2和Sun):
$ gcc -O xx.c -o xx xx.c:14:14: warning: multi-character character constant xx.c:15:14: warning: multi-character character constant xx.c:16:14: warning: multi-character character constant xx.c:17:14: warning: multi-character character constant xx.c:18:14: warning: multi-character character constant xx.c:19:14: warning: multi-character character constant xx.c:20:14: warning: multi-character character constant xx.c:21:14: warning: multi-character character constant xx.c:22:14: warning: multi-character character constant xx.c:23:14: warning: multi-character character constant xx.c:24:14: warning: multi-character character constant xx.c:25:14: warning: multi-character character constant $ ./xx $ cc -o xx xx.c $ ./xx !! FAIL !! Jan/ (got -1, wanted 0) !! FAIL !! Feb/ (got -1, wanted 1) !! FAIL !! Mar/ (got -1, wanted 2) !! FAIL !! Apr/ (got -1, wanted 3) !! FAIL !! May/ (got -1, wanted 4) !! FAIL !! Jun/ (got -1, wanted 5) !! FAIL !! Jul/ (got -1, wanted 6) !! FAIL !! Aug/ (got -1, wanted 7) !! FAIL !! Sep/ (got -1, wanted 8) !! FAIL !! Oct/ (got -1, wanted 9) !! FAIL !! Nov/ (got -1, wanted 10) !! FAIL !! Dec/ (got -1, wanted 11) $
请注意,最后一个测试用例仍然通过 - 也就是说,它生成了-1.
这是一个修订的 - 更详细的 - parseMonth()版本,它在GCC和Sun C编译器下的工作原理相同:
#include/* MONTH_CODE("Jan/") does not reduce to an integer constant */ #define MONTH_CODE(x) ((((((x[0]<<8)|x[1])<<8)|x[2])<<8)|x[3]) #define MONTH_JAN (((((('J'<<8)|'a')<<8)|'n')<<8)|'/') #define MONTH_FEB (((((('F'<<8)|'e')<<8)|'b')<<8)|'/') #define MONTH_MAR (((((('M'<<8)|'a')<<8)|'r')<<8)|'/') #define MONTH_APR (((((('A'<<8)|'p')<<8)|'r')<<8)|'/') #define MONTH_MAY (((((('M'<<8)|'a')<<8)|'y')<<8)|'/') #define MONTH_JUN (((((('J'<<8)|'u')<<8)|'n')<<8)|'/') #define MONTH_JUL (((((('J'<<8)|'u')<<8)|'l')<<8)|'/') #define MONTH_AUG (((((('A'<<8)|'u')<<8)|'g')<<8)|'/') #define MONTH_SEP (((((('S'<<8)|'e')<<8)|'p')<<8)|'/') #define MONTH_OCT (((((('O'<<8)|'c')<<8)|'t')<<8)|'/') #define MONTH_NOV (((((('N'<<8)|'o')<<8)|'v')<<8)|'/') #define MONTH_DEC (((((('D'<<8)|'e')<<8)|'c')<<8)|'/') static int parseMonth(const char *input) { int rv=-1; int inputInt=0; int i=0; for(i=0; i<4 && input[i]; i++) { inputInt = (inputInt << 8) | input[i]; } switch(inputInt) { case MONTH_JAN: rv=0; break; case MONTH_FEB: rv=1; break; case MONTH_MAR: rv=2; break; case MONTH_APR: rv=3; break; case MONTH_MAY: rv=4; break; case MONTH_JUN: rv=5; break; case MONTH_JUL: rv=6; break; case MONTH_AUG: rv=7; break; case MONTH_SEP: rv=8; break; case MONTH_OCT: rv=9; break; case MONTH_NOV: rv=10; break; case MONTH_DEC: rv=11; break; } return rv; } static const struct { char *data; int result; } test_case[] = { { "Jan/", 0 }, { "Feb/", 1 }, { "Mar/", 2 }, { "Apr/", 3 }, { "May/", 4 }, { "Jun/", 5 }, { "Jul/", 6 }, { "Aug/", 7 }, { "Sep/", 8 }, { "Oct/", 9 }, { "Nov/", 10 }, { "Dec/", 11 }, { "aJ/n", -1 }, { "/naJ", -1 }, }; #define DIM(x) (sizeof(x)/sizeof(*(x))) int main(void) { size_t i; int result; for (i = 0; i < DIM(test_case); i++) { result = parseMonth(test_case[i].data); if (result != test_case[i].result) printf("!! FAIL !! %s (got %d, wanted %d)\n", test_case[i].data, result, test_case[i].result); } return(0); }
我想使用MONTH_CODE(),但编译器没有合作.
if ( !input[0] || !input[1] || !input[2] || input[3] != '/' ) return -1; switch ( input[0] ) { case 'F': return 1; // Feb case 'S': return 8; // Sep case 'O': return 9; // Oct case 'N': return 10; // Nov case 'D': return 11; // Dec; case 'A': return input[1] == 'p' ? 3 : 7; // Apr, Aug case 'M': return input[2] == 'r' ? 2 : 4; // Mar, May default: return input[1] == 'a' ? 0 : (input[2] == 'n' ? 5 : 6); // Jan, Jun, Jul }
稍微不那么可读,而不是那么多验证,但可能更快,不是吗?
你只是计算这四个字符的哈希值.为什么不预先定义一些以相同方式计算哈希的整数常量并使用它们呢?相同的可读性,您不依赖于编译器的任何特定于实现的特性.
uint32_t MONTH_JAN = 'J' << 24 + 'a' << 16 + 'n' << 8 + '/'; uint32_t MONTH_FEB = 'F' << 24 + 'e' << 16 + 'b' << 8 + '/'; ... static uint32_t parseMonth(const char *input) { uint32_t rv=-1; uint32_t inputInt=0; int i=0; for(i=0; i<4 && input[i]; i++) { inputInt = (inputInt << 8) | (input[i] & 0x7f); // clear top bit } switch(inputInt) { case MONTH_JAN: rv=0; break; case MONTH_FEB: rv=1; break; ... } return rv; }
我只知道C标准对此有何看法(C99):
包含多个字符(例如,'ab')的整数字符常量的值,或包含未映射到单字节执行字符的字符或转义序列的值是实现定义的.如果整数字符常量包含单个字符或转义序列,则其值是当char类型的对象(单值字符或转义序列的值)转换为int类型时生成的值.
(6.4.4.4/10取自草案)
所以它是实现定义的.意味着它无法保证在任何地方都能正常工作,但行为必须由实现记录.例如,如果int
在特定实现中只有16位宽,'Jan/'
则不能再像你想要的那样表示它(char
必须至少为8位,而字符文字总是类型int
).
char *months = "Jan/Feb/Mar/Apr/May/Jun/Jul/Aug/Sep/Oct/Nov/Dec/"; char *p = strnstr(months, input, 4); return p ? (p - months) / 4 : -1;