/*- * DBSQL - A SQL database engine. * * Copyright (C) 2007-2008 The DBSQL Group, Inc. - All rights reserved. * * This library is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * There are special exceptions to the terms and conditions of the GPL as it * is applied to this software. View the full text of the exception in file * LICENSE_EXCEPTIONS in the directory of this software distribution. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. */ /* * A tokenizer for SQL * * This file contains C code that splits an SQL input string up into * individual tokens and sends those tokens one-by-one over to the * parser for analysis. */ #include "dbsql_config.h" #ifndef NO_SYSTEM_INCLUDES #include #include #endif #include "dbsql_int.h" /* * All the keywords of the SQL language are stored in a hash * table composed of instances of the following structure.
*/ typedef struct keyword keyword_t; struct keyword { char *name; /* The keyword name */ u_int8_t token_type; /* token_t value for this keyword */ u_int8_t len; /* Length of this keyword */ u_int8_t inext; /* Index in sql_tokens_table[] of next with same hash */ }; /* * These are the keywords */ static keyword_t sql_tokens_table[] = { { "ABORT", TK_ABORT, }, { "AFTER", TK_AFTER, }, { "ALL", TK_ALL, }, { "AND", TK_AND, }, { "AS", TK_AS, }, { "ASC", TK_ASC, }, { "ATTACH", TK_ATTACH, }, { "BEFORE", TK_BEFORE, }, { "BEGIN", TK_BEGIN, }, { "BETWEEN", TK_BETWEEN, }, { "BY", TK_BY, }, { "CASCADE", TK_CASCADE, }, { "CASE", TK_CASE, }, { "CHECK", TK_CHECK, }, { "CLUSTER", TK_CLUSTER, }, { "COLLATE", TK_COLLATE, }, { "COMMIT", TK_COMMIT, }, { "CONFLICT", TK_CONFLICT, }, { "CONSTRAINT", TK_CONSTRAINT, }, { "COPY", TK_COPY, }, { "CREATE", TK_CREATE, }, { "CROSS", TK_JOIN_KW, }, { "DATABASE", TK_DATABASE, }, { "DEFAULT", TK_DEFAULT, }, { "DEFERRED", TK_DEFERRED, }, { "DEFERRABLE", TK_DEFERRABLE, }, { "DELETE", TK_DELETE, }, { "DELIMITERS", TK_DELIMITERS, }, { "DESC", TK_DESC, }, { "DETACH", TK_DETACH, }, { "DISTINCT", TK_DISTINCT, }, { "DROP", TK_DROP, }, { "END", TK_END, }, { "EACH", TK_EACH, }, { "ELSE", TK_ELSE, }, { "EXCEPT", TK_EXCEPT, }, { "EXPLAIN", TK_EXPLAIN, }, { "FAIL", TK_FAIL, }, { "FOR", TK_FOR, }, { "FOREIGN", TK_FOREIGN, }, { "FROM", TK_FROM, }, { "FULL", TK_JOIN_KW, }, { "GLOB", TK_GLOB, }, { "GROUP", TK_GROUP, }, { "HAVING", TK_HAVING, }, { "IGNORE", TK_IGNORE, }, { "IMMEDIATE", TK_IMMEDIATE, }, { "IN", TK_IN, }, { "INDEX", TK_INDEX, }, { "INITIALLY", TK_INITIALLY, }, { "INNER", TK_JOIN_KW, }, { "INSERT", TK_INSERT, }, { "INSTEAD", TK_INSTEAD, }, { "INTERSECT", TK_INTERSECT, }, { "INTO", TK_INTO, }, { "IS", TK_IS, }, { "ISNULL", TK_ISNULL, }, { "JOIN", TK_JOIN, }, { "KEY", TK_KEY, }, { "LEFT", TK_JOIN_KW, }, { "LIKE", TK_LIKE, }, { "LIMIT", TK_LIMIT, }, { "MATCH", TK_MATCH, }, { "NATURAL", TK_JOIN_KW, }, { "NOT", TK_NOT, }, { "NOTNULL", TK_NOTNULL, }, { 
"NULL", TK_NULL, }, { "OF", TK_OF, }, { "OFFSET", TK_OFFSET, }, { "ON", TK_ON, }, { "OR", TK_OR, }, { "ORDER", TK_ORDER, }, { "OUTER", TK_JOIN_KW, }, { "PRAGMA", TK_PRAGMA, }, { "PRIMARY", TK_PRIMARY, }, { "RAISE", TK_RAISE, }, { "REFERENCES", TK_REFERENCES, }, { "REPLACE", TK_REPLACE, }, { "RESTRICT", TK_RESTRICT, }, { "RIGHT", TK_JOIN_KW, }, { "ROLLBACK", TK_ROLLBACK, }, { "ROW", TK_ROW, }, { "SELECT", TK_SELECT, }, { "SET", TK_SET, }, { "STATEMENT", TK_STATEMENT, }, { "TABLE", TK_TABLE, }, { "TEMP", TK_TEMP, }, { "TEMPORARY", TK_TEMP, }, { "THEN", TK_THEN, }, { "TRANSACTION", TK_TRANSACTION, }, { "TRIGGER", TK_TRIGGER, }, { "UNION", TK_UNION, }, { "UNIQUE", TK_UNIQUE, }, { "UPDATE", TK_UPDATE, }, { "USING", TK_USING, }, { "VACUUM", TK_VACUUM, }, { "VALUES", TK_VALUES, }, { "VIEW", TK_VIEW, }, { "WHEN", TK_WHEN, }, { "WHERE", TK_WHERE, }, }; /* * This is the hash table */ #define KEY_HASH_SIZE 101 static u_int8_t ai_table[KEY_HASH_SIZE]; /* * __get_keyword_code -- * This function looks up an identifier to determine if it is a * keyword. If it is a keyword, the token code of that keyword is * returned. If the input is not a keyword, TK_ID is returned. 
*
 * PUBLIC: int __get_keyword_code __P((const char *, int));
 */
/*
 * Parameters:
 *	z	first byte of the candidate identifier; it is only read for
 *		n bytes and does not have to be NUL-terminated there.
 *	n	length of the candidate in bytes.
 *
 * The candidate is matched with strncasecmp(), so case is ignored.  The
 * keyword hash table is built on the first call; that one-time setup is
 * not protected by any lock (see the TODO markers below).
 */
int
__get_keyword_code(z, n)
	const char *z;
	int n;
{
	static char need_init = 1;
	keyword_t *kw;
	int bucket, idx, count;

	if (need_init) {
		/* TODO: beginning of what used to be mutex'ed */
		/*
		 * Initialize the keyword hash table: record each keyword's
		 * length and push it on the front of its bucket's chain.
		 * Chain links are 1-based so that 0 can mean "end".
		 */
		need_init = 0;
		count = sizeof(sql_tokens_table) / sizeof(sql_tokens_table[0]);
		for (idx = 0; idx < count; idx++) {
			kw = &sql_tokens_table[idx];
			kw->len = strlen(kw->name);
			bucket = __hash_ignore_case(kw->name, kw->len);
			bucket %= KEY_HASH_SIZE;
			kw->inext = ai_table[bucket];
			ai_table[bucket] = idx + 1;
		}
	}
	/* TODO: end of what used to be mutex'ed */

	/* Walk the chain for this hash value looking for an exact match. */
	bucket = __hash_ignore_case(z, n) % KEY_HASH_SIZE;
	idx = ai_table[bucket];
	while (idx != 0) {
		kw = &sql_tokens_table[idx - 1];
		if (kw->len == n && strncasecmp(kw->name, z, n) == 0)
			return kw->token_type;
		idx = kw->inext;
	}
	return TK_ID;
}

/*
 * If X is a character that can be used in an identifier and
 * X&0x80==0 then id_char_p[X] will be 1.  If X&0x80==0x80 then
 * X is always an identifier character.  (Hence all UTF-8
 * characters can be part of an identifier).  id_char_p[X] will
 * be 0 for every character in the lower 128 ASCII characters
 * that cannot be used as part of an identifier.
 *
 * In this implementation, an identifier can be a string of
 * alphabetic characters, digits, and "_" plus any character
 * with the high-order bit set.  The latter rule means that
 * any sequence of UTF-8 characters or characters taken from
 * an extended ISO8859 character set can form an identifier.
*/ static const char id_char_p[] = { /* x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xA xB xC xD xE xF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1x */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 2x */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, /* 3x */ 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 4x */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, /* 5x */ 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 6x */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, /* 7x */ }; /* * __get_token -- * Return the length of the token that begins at z[0]. * Store the token type in *token_type before returning. * * STATIC: static int __get_token __P((const unsigned char *, int *)); */ static int __get_token(z, token_type) const unsigned char *z; int *token_type; { int i, delim; switch(*z) { case ' ': /* FALLTHROUGH */ case '\t': /* FALLTHROUGH */ case '\n': /* FALLTHROUGH */ case '\f': /* FALLTHROUGH */ case '\r': /* FALLTHROUGH */ i = 1; while (isspace(z[i])) { i++; } *token_type = TK_SPACE; return i; break; case '-': if (z[1] == '-') { i = 2; while (z[i] && z[i] != '\n') { i++; } *token_type = TK_COMMENT; return i; } *token_type = TK_MINUS; return 1; break; case '(': *token_type = TK_LP; return 1; break; case ')': *token_type = TK_RP; return 1; break; case ';': *token_type = TK_SEMI; return 1; break; case '+': *token_type = TK_PLUS; return 1; break; case '*': *token_type = TK_STAR; return 1; break; case '/': if (z[1] != '*' || z[2] == 0) { *token_type = TK_SLASH; return 1; } i = 3; while (z[i] && (z[i]!='/' || z[i-1]!='*')) { i++; } if (z[i]) i++; *token_type = TK_COMMENT; return i; break; case '%': *token_type = TK_REM; return 1; break; case '=': *token_type = TK_EQ; return 1 + (z[1] == '='); break; case '<': if (z[1] == '=') { *token_type = TK_LE; return 2; } else if (z[1] == '>') { *token_type = TK_NE; return 2; } else if (z[1] == '<') { *token_type = TK_LSHIFT; return 2; } else { *token_type = 
TK_LT; return 1; } break; case '>': if (z[1] == '=') { *token_type = TK_GE; return 2; } else if (z[1] == '>') { *token_type = TK_RSHIFT; return 2; } else { *token_type = TK_GT; return 1; } break; case '!': if (z[1] != '=') { *token_type = TK_ILLEGAL; return 2; } else { *token_type = TK_NE; return 2; } break; case '|': if (z[1] != '|') { *token_type = TK_BITOR; return 1; } else { *token_type = TK_CONCAT; return 2; } break; case ',': *token_type = TK_COMMA; return 1; break; case '&': *token_type = TK_BITAND; return 1; break; case '~': *token_type = TK_BITNOT; return 1; break; case '\'': /* FALLTHROUGH */ case '"': delim = z[0]; for (i = 1; z[i]; i++) { if (z[i] == delim) { if (z[i+1] == delim) { i++; } else { break; } } } if (z[i]) i++; *token_type = TK_STRING; return i; case '.': *token_type = TK_DOT; return 1; break; case '0': /* FALLTHROUGH */ case '1': /* FALLTHROUGH */ case '2': /* FALLTHROUGH */ case '3': /* FALLTHROUGH */ case '4': /* FALLTHROUGH */ case '5': /* FALLTHROUGH */ case '6': /* FALLTHROUGH */ case '7': /* FALLTHROUGH */ case '8': /* FALLTHROUGH */ case '9': *token_type = TK_INTEGER; i = 1; while (isdigit(z[i])) { i++; } if (z[i] == '.' && isdigit(z[i+1])) { i += 2; while(isdigit(z[i])) { i++; } *token_type = TK_FLOAT; } if ((z[i] == 'e' || z[i] == 'E') && (isdigit(z[i+1]) || ((z[i+1] == '+' || z[i+1] == '-') && isdigit(z[i+2])))) { i += 2; while(isdigit(z[i])) { i++; } *token_type = TK_FLOAT; } return i; break; case '[': i = 1; while (z[i] && z[i-1] != ']') { i++; } *token_type = TK_ID; return i; break; case '?': *token_type = TK_VARIABLE; return 1; break; default: if ((*z & 0x80) == 0 && !id_char_p[*z]) { break; } i = 1; while((z[i] & 0x80)!=0 || id_char_p[z[i]]) { i++; } *token_type = __get_keyword_code((char*)z, i); return i; break; } *token_type = TK_ILLEGAL; return 1; } /* * __run_sql_parser -- * Run the parser on the given SQL string. The parser structure is * passed in. A DBSQL_ status code is returned. 
* * PUBLIC: int __run_sql_parser __P((parser_t *, const char *, char **)); */ /*TODO: REMOVE THIS If an error occurs * and pzErrMsg!=NULL then an error message might be written into * memory obtained from malloc() and *pzErrMsg made to point to that * error message. Or maybe not. */ int __run_sql_parser(parser, sql, err_msgs) parser_t *parser; const char *sql; char **err_msgs; { int nerr = 0; int i; void *engine; int token_type; int last_token_parsed = -1; DBSQL *dbp = parser->db; extern void *__sql_parser_alloc(DBSQL *, int(*)(DBSQL*,size_t,void *)); extern void __sql_parser_free(DBSQL *, void *, void(*)(DBSQL *,void*)); extern int __sql_parser(void*, int, token_t, parser_t*); dbp->flags &= ~DBSQL_Interrupt; parser->rc = DBSQL_SUCCESS; i = 0; engine = __sql_parser_alloc(dbp, __dbsql_malloc); if (engine == 0) { __str_append(err_msgs, "out of memory", (char*)0); return 1; } parser->sLastToken.dyn = 0; parser->zTail = sql; while (parser->rc == DBSQL_SUCCESS && sql[i] != 0) { DBSQL_ASSERT(i >= 0); parser->sLastToken.z = &sql[i]; DBSQL_ASSERT(parser->sLastToken.dyn == 0); parser->sLastToken.n = __get_token((unsigned char*)&sql[i], &token_type); i += parser->sLastToken.n; switch (token_type) { case TK_SPACE: /* FALLTHROUGH */ case TK_COMMENT: if ((dbp->flags & DBSQL_Interrupt) != 0) { parser->rc = DBSQL_INTERRUPTED; __str_append(err_msgs, "interrupt", (char*)0); goto abort_parse; } break; case TK_ILLEGAL: __str_nappend(err_msgs, "unrecognized token: \"", -1, parser->sLastToken.z, parser->sLastToken.n, "\"", 1, NULL); nerr++; goto abort_parse; break; case TK_SEMI: parser->zTail = &sql[i]; /* FALLTHROUGH */ default: __sql_parser(engine, token_type, parser->sLastToken, parser); last_token_parsed = token_type; if (parser->rc != DBSQL_SUCCESS) { goto abort_parse; } break; } } abort_parse: if (sql[i] == 0 && nerr == 0 && parser->rc == DBSQL_SUCCESS) { if (last_token_parsed != TK_SEMI) { __sql_parser(engine, TK_SEMI, parser->sLastToken, parser); parser->zTail = &sql[i]; } 
__sql_parser(engine, 0, parser->sLastToken, parser); } __sql_parser_free(dbp, engine, __dbsql_free); if (parser->rc != DBSQL_SUCCESS && parser->rc != DBSQL_DONE && parser->zErrMsg == 0) { __str_append(&parser->zErrMsg, dbsql_strerror(parser->rc), (char*)0); } if (parser->zErrMsg) { if (err_msgs && *err_msgs == 0) { *err_msgs = parser->zErrMsg; } else { __dbsql_free(dbp, parser->zErrMsg); } parser->zErrMsg = 0; if (!nerr) nerr++; } if (parser->pVdbe && (parser->useCallback || parser->nErr > 0)) { __vdbe_delete(parser->pVdbe); parser->pVdbe = 0; } if (parser->pNewTable) { __vdbe_delete_table(parser->db, parser->pNewTable); parser->pNewTable = 0; } if (parser->pNewTrigger) { __vdbe_delete_trigger(parser->pNewTrigger); parser->pNewTrigger = 0; } if (nerr > 0 && (parser->rc == DBSQL_SUCCESS || parser->rc == DBSQL_DONE)) { parser->rc = DBSQL_ERROR; } return nerr; } /* * Token types used by the dbsql_complete_stmt() routine. See the header * comments on that procedure for additional information. */ #define tkEXPLAIN 0 #define tkCREATE 1 #define tkTEMP 2 #define tkTRIGGER 3 #define tkEND 4 #define tkSEMI 5 #define tkWS 6 #define tkOTHER 7 /* * dbsql_complete_stmt -- * * Return TRUE if the given SQL string ends in a semicolon. * * Special handling is require for CREATE TRIGGER statements. * Whenever the CREATE TRIGGER keywords are seen, the statement * must end with ";END;". * * This implementation uses a state machine with 7 states: * * (0) START At the beginning or end of an SQL statement. This routine * returns 1 if it ends in the START state and 0 if it ends * in any other state. * * (1) EXPLAIN The keyword EXPLAIN has been seen at the beginning of * a statement. * * (2) CREATE The keyword CREATE has been seen at the beginning of a * statement, possibly preceeded by EXPLAIN and/or followed by * TEMP or TEMPORARY * * (3) NORMAL We are in the middle of statement which ends with a single * semicolon. 
* * (4) TRIGGER We are in the middle of a trigger definition that must be * ended by a semicolon, the keyword END, and another * semicolon. * * (5) SEMI We've seen the first semicolon in the ";END;" that occurs at * the end of a trigger definition. * * (6) END We've seen the ";END" of the ";END;" that occurs at the end * of a trigger difinition. * * Transitions between states above are determined by tokens extracted * from the input. The following tokens are significant: * * (0) tkEXPLAIN The "explain" keyword. * (1) tkCREATE The "create" keyword. * (2) tkTEMP The "temp" or "temporary" keyword. * (3) tkTRIGGER The "trigger" keyword. * (4) tkEND The "end" keyword. * (5) tkSEMI A semicolon. * (6) tkWS Whitespace * (7) tkOTHER Any other SQL token. * * Whitespace never causes a state transition and is always ignored. * * EXTERN: int dbsql_complete_stmt __P((const char *)); * */ int dbsql_complete_stmt(sql) const char *sql; { u_int8_t state = 0; /* Current state, using values from comment */ u_int8_t token; /* Value of the next token */ int c; /* * The following matrix defines the transition from one state to * another according to what token is seen. trans[state][token] * returns the next state. 
*/ static const u_int8_t trans[7][8] = { /* Token: */ /* State: ** EXPLAIN CREATE TEMP TRIGGER END SEMI WS OTHER */ /* 0 START: */ { 1, 2, 3, 3, 3, 0, 0, 3, }, /* 1 EXPLAIN: */ { 3, 2, 3, 3, 3, 0, 1, 3, }, /* 2 CREATE: */ { 3, 3, 2, 4, 3, 0, 2, 3, }, /* 3 NORMAL: */ { 3, 3, 3, 3, 3, 0, 3, 3, }, /* 4 TRIGGER: */ { 4, 4, 4, 4, 4, 5, 4, 4, }, /* 5 SEMI: */ { 4, 4, 4, 4, 6, 5, 5, 4, }, /* 6 END: */ { 4, 4, 4, 4, 4, 0, 6, 4, }, }; while (*sql) { switch (*sql) { case ';': token = tkSEMI; break; case ' ': /* FALLTHROUGH */ case '\r': /* FALLTHROUGH */ case '\t': /* FALLTHROUGH */ case '\n': /* FALLTHROUGH */ case '\f': /* White space is ignored */ token = tkWS; break; case '/': /* C-style comments */ if (sql[1] != '*') { token = tkOTHER; break; } sql += 2; while (sql[0] && (sql[0] != '*' || sql[1] != '/')) { sql++; } if (sql[0] == 0) return 0; sql++; token = tkWS; break; case '-': /* SQL-style comments from "--" to end of line */ if (sql[1] != '-') { token = tkOTHER; break; } while (*sql && *sql != '\n') { sql++; } if (*sql == 0) return state == 0; token = tkWS; break; case '[': /* Microsoft-style identifiers in [...] 
*/ sql++; while (*sql && *sql!=']') { sql++; } if (*sql == 0) return 0; token = tkOTHER; break; case '"': /* single- and double-quoted strings */ /* FALLTHROUGH */ case '\'': c = *sql; sql++; while (*sql && *sql != c) { sql++; } if (*sql == 0) return 0; token = tkOTHER; break; default: if (id_char_p[(u_int8_t)*sql]) { /* Keywords and unquoted identifiers */ int nid = 1; while (id_char_p[(u_int8_t)sql[nid]]) { nid++; } switch (*sql) { case 'c': /* FALLTHROUGH */ case 'C': if (nid == 6 && strncasecmp(sql, "create", 6) == 0) { token = tkCREATE; } else { token = tkOTHER; } break; case 't': /* FALLTHROUGH */ case 'T': if (nid == 7 && strncasecmp(sql, "trigger", 7) == 0 ) { token = tkTRIGGER; } else if (nid == 4 && strncasecmp(sql, "temp", 4) == 0) { token = tkTEMP; } else if (nid == 9 && strncasecmp(sql, "temporary", 9) == 0) { token = tkTEMP; } else { token = tkOTHER; } break; case 'e': /* FALLTHROUGH */ case 'E': if (nid == 3 && strncasecmp(sql, "end", 3) == 0) { token = tkEND; } else if (nid == 7 && strncasecmp(sql, "explain", 7) == 0) { token = tkEXPLAIN; } else { token = tkOTHER; } break; default: token = tkOTHER; break; } sql += nid - 1; } else { /* Operators and special symbols */ token = tkOTHER; } break; } state = trans[state][token]; sql++; } return state == 0; }