dbsql/src/sql_tokenize.c
2009-08-31 20:28:38 -04:00

793 lines
21 KiB
C

/*-
* DBSQL - A SQL database engine.
*
* Copyright (C) 2007-2008 The DBSQL Group, Inc. - All rights reserved.
*
* This library is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* There are special exceptions to the terms and conditions of the GPL as it
* is applied to this software. View the full text of the exception in file
* LICENSE_EXCEPTIONS in the directory of this software distribution.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*/
/*
* A tokenizer for SQL
*
* This file contains C code that splits an SQL input string up into
* individual tokens and sends those tokens one-by-one over to the
* parser for analysis.
*/
#include "dbsql_config.h"
#ifndef NO_SYSTEM_INCLUDES
#include <ctype.h>
#include <stdlib.h>
#endif
#include "dbsql_int.h"
/*
* All the keywords of the SQL language are stored in a hash
* table composed of instances of the following structure.
*/
typedef struct keyword {
	char *name;		/* Keyword text, e.g. "SELECT" */
	u_int8_t token_type;	/* TK_* code reported for this keyword */
	u_int8_t len;		/* strlen(name); filled in on first lookup */
	u_int8_t inext;		/* 1-based index into sql_tokens_table[] of
				   the next keyword on the same hash chain,
				   or 0 at the end of the chain */
} keyword_t;
/*
 * The SQL keywords and the token code reported for each of them.
 *
 * Only the name and token_type members are initialized here; len and
 * inext start out as zero and are filled in by __get_keyword_code() the
 * first time it is called.
 *
 * Several keywords share one token code: the join modifiers (CROSS,
 * FULL, INNER, LEFT, NATURAL, OUTER and RIGHT) all map to TK_JOIN_KW,
 * and TEMP and TEMPORARY both map to TK_TEMP.
 *
 * Hash chain links are kept as 1-based u_int8_t indexes into this
 * array, so the array must not grow beyond 255 entries.
 */
static keyword_t sql_tokens_table[] = {
{ "ABORT", TK_ABORT, },
{ "AFTER", TK_AFTER, },
{ "ALL", TK_ALL, },
{ "AND", TK_AND, },
{ "AS", TK_AS, },
{ "ASC", TK_ASC, },
{ "ATTACH", TK_ATTACH, },
{ "BEFORE", TK_BEFORE, },
{ "BEGIN", TK_BEGIN, },
{ "BETWEEN", TK_BETWEEN, },
{ "BY", TK_BY, },
{ "CASCADE", TK_CASCADE, },
{ "CASE", TK_CASE, },
{ "CHECK", TK_CHECK, },
{ "CLUSTER", TK_CLUSTER, },
{ "COLLATE", TK_COLLATE, },
{ "COMMIT", TK_COMMIT, },
{ "CONFLICT", TK_CONFLICT, },
{ "CONSTRAINT", TK_CONSTRAINT, },
{ "COPY", TK_COPY, },
{ "CREATE", TK_CREATE, },
{ "CROSS", TK_JOIN_KW, },
{ "DATABASE", TK_DATABASE, },
{ "DEFAULT", TK_DEFAULT, },
{ "DEFERRED", TK_DEFERRED, },
{ "DEFERRABLE", TK_DEFERRABLE, },
{ "DELETE", TK_DELETE, },
{ "DELIMITERS", TK_DELIMITERS, },
{ "DESC", TK_DESC, },
{ "DETACH", TK_DETACH, },
{ "DISTINCT", TK_DISTINCT, },
{ "DROP", TK_DROP, },
{ "END", TK_END, },
{ "EACH", TK_EACH, },
{ "ELSE", TK_ELSE, },
{ "EXCEPT", TK_EXCEPT, },
{ "EXPLAIN", TK_EXPLAIN, },
{ "FAIL", TK_FAIL, },
{ "FOR", TK_FOR, },
{ "FOREIGN", TK_FOREIGN, },
{ "FROM", TK_FROM, },
{ "FULL", TK_JOIN_KW, },
{ "GLOB", TK_GLOB, },
{ "GROUP", TK_GROUP, },
{ "HAVING", TK_HAVING, },
{ "IGNORE", TK_IGNORE, },
{ "IMMEDIATE", TK_IMMEDIATE, },
{ "IN", TK_IN, },
{ "INDEX", TK_INDEX, },
{ "INITIALLY", TK_INITIALLY, },
{ "INNER", TK_JOIN_KW, },
{ "INSERT", TK_INSERT, },
{ "INSTEAD", TK_INSTEAD, },
{ "INTERSECT", TK_INTERSECT, },
{ "INTO", TK_INTO, },
{ "IS", TK_IS, },
{ "ISNULL", TK_ISNULL, },
{ "JOIN", TK_JOIN, },
{ "KEY", TK_KEY, },
{ "LEFT", TK_JOIN_KW, },
{ "LIKE", TK_LIKE, },
{ "LIMIT", TK_LIMIT, },
{ "MATCH", TK_MATCH, },
{ "NATURAL", TK_JOIN_KW, },
{ "NOT", TK_NOT, },
{ "NOTNULL", TK_NOTNULL, },
{ "NULL", TK_NULL, },
{ "OF", TK_OF, },
{ "OFFSET", TK_OFFSET, },
{ "ON", TK_ON, },
{ "OR", TK_OR, },
{ "ORDER", TK_ORDER, },
{ "OUTER", TK_JOIN_KW, },
{ "PRAGMA", TK_PRAGMA, },
{ "PRIMARY", TK_PRIMARY, },
{ "RAISE", TK_RAISE, },
{ "REFERENCES", TK_REFERENCES, },
{ "REPLACE", TK_REPLACE, },
{ "RESTRICT", TK_RESTRICT, },
{ "RIGHT", TK_JOIN_KW, },
{ "ROLLBACK", TK_ROLLBACK, },
{ "ROW", TK_ROW, },
{ "SELECT", TK_SELECT, },
{ "SET", TK_SET, },
{ "STATEMENT", TK_STATEMENT, },
{ "TABLE", TK_TABLE, },
{ "TEMP", TK_TEMP, },
{ "TEMPORARY", TK_TEMP, },
{ "THEN", TK_THEN, },
{ "TRANSACTION", TK_TRANSACTION, },
{ "TRIGGER", TK_TRIGGER, },
{ "UNION", TK_UNION, },
{ "UNIQUE", TK_UNIQUE, },
{ "UPDATE", TK_UPDATE, },
{ "USING", TK_USING, },
{ "VACUUM", TK_VACUUM, },
{ "VALUES", TK_VALUES, },
{ "VIEW", TK_VIEW, },
{ "WHEN", TK_WHEN, },
{ "WHERE", TK_WHERE, },
};
/*
 * Heads of the keyword hash chains.
 *
 * ai_table[h] holds the 1-based index into sql_tokens_table[] of the
 * keyword at the head of the chain for bucket h, where h is the
 * case-insensitive hash of the keyword modulo KEY_HASH_SIZE.  A value
 * of zero means the bucket is empty.  The table starts out all zero
 * and is populated by __get_keyword_code() on its first call.
 */
#define KEY_HASH_SIZE 101
static u_int8_t ai_table[KEY_HASH_SIZE];
/*
* __get_keyword_code --
* This function looks up an identifier to determine if it is a
* keyword. If it is a keyword, the token code of that keyword is
* returned. If the input is not a keyword, TK_ID is returned.
*
* PUBLIC: int __get_keyword_code __P((const char *, int));
*/
int
__get_keyword_code(z, n)
	const char *z;
	int n;
{
	static char need_init = 1;
	keyword_t *kw;
	int bucket, idx, count;

	/*
	 * On the first call, compute the length of every keyword and
	 * thread it onto the hash chain for its bucket.  New entries are
	 * pushed on the front of the chain.
	 * TODO: this initialization used to be protected by a mutex.
	 */
	if (need_init) {
		need_init = 0;
		count = sizeof(sql_tokens_table) / sizeof(sql_tokens_table[0]);
		idx = 0;
		while (idx < count) {
			kw = &sql_tokens_table[idx];
			kw->len = strlen(kw->name);
			bucket = __hash_ignore_case(kw->name, kw->len);
			bucket %= KEY_HASH_SIZE;
			kw->inext = ai_table[bucket];
			/* Chain links are 1-based so that 0 can mean "none". */
			idx++;
			ai_table[bucket] = idx;
		}
	}

	/*
	 * Walk the chain for the identifier's bucket looking for a keyword
	 * of the same length that matches without regard to case.
	 */
	bucket = __hash_ignore_case(z, n) % KEY_HASH_SIZE;
	idx = ai_table[bucket];
	while (idx != 0) {
		kw = &sql_tokens_table[idx - 1];
		if (kw->len == n && strncasecmp(kw->name, z, n) == 0)
			return kw->token_type;
		idx = kw->inext;
	}

	/* Not a keyword, so it is an ordinary identifier. */
	return TK_ID;
}
/*
 * Identifier-character lookup table for the 7-bit ASCII range.
 *
 * If X is a character with X&0x80==0, then id_char_p[X] is 1 when X
 * can be used in an identifier and 0 when it cannot.  The entries that
 * are set are the digits, the upper- and lower-case letters and "_".
 *
 * The table has only 128 entries.  A character with the high-order bit
 * set (X&0x80==0x80) is always an identifier character, which means
 * that any sequence of UTF-8 characters or characters taken from an
 * extended ISO8859 character set can form an identifier.  That rule is
 * not encoded in the table: the caller has to test X&0x80 itself
 * before indexing id_char_p[] with X.
 */
static const char id_char_p[] = {
/* x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xA xB xC xD xE xF */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1x */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 2x */
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, /* 3x */
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 4x */
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, /* 5x */
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 6x */
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, /* 7x */
};
/*
* __get_token --
* Return the length of the token that begins at z[0].
* Store the token type in *token_type before returning.
*
* STATIC: static int __get_token __P((const unsigned char *, int *));
*/
static int
__get_token(z, token_type)
	const unsigned char *z;
	int *token_type;
{
	int i, delim;

	/*
	 * z must point at a NUL-terminated string.  The first byte selects
	 * the kind of token; each case stores the TK_* code through
	 * token_type and returns the number of bytes the token occupies.
	 * The returned length never extends past the terminating NUL.
	 */
	switch (*z) {
	case ' ':				/* FALLTHROUGH */
	case '\t':				/* FALLTHROUGH */
	case '\n':				/* FALLTHROUGH */
	case '\f':				/* FALLTHROUGH */
	case '\r':
		/* A run of white space is reported as one token. */
		i = 1;
		while (isspace(z[i])) {
			i++;
		}
		*token_type = TK_SPACE;
		return i;
	case '-':
		if (z[1] == '-') {
			/* "--" comment: up to, not including, the newline. */
			i = 2;
			while (z[i] && z[i] != '\n') {
				i++;
			}
			*token_type = TK_COMMENT;
			return i;
		}
		*token_type = TK_MINUS;
		return 1;
	case '(':
		*token_type = TK_LP;
		return 1;
	case ')':
		*token_type = TK_RP;
		return 1;
	case ';':
		*token_type = TK_SEMI;
		return 1;
	case '+':
		*token_type = TK_PLUS;
		return 1;
	case '*':
		*token_type = TK_STAR;
		return 1;
	case '/':
		if (z[1] != '*' || z[2] == 0) {
			*token_type = TK_SLASH;
			return 1;
		}
		/*
		 * C-style comment.  Scanning starts at offset 3 so that the
		 * '*' of the opening delimiter cannot double as the '*' of
		 * the closing one.  An unterminated comment runs to the end
		 * of the input.
		 */
		i = 3;
		while (z[i] && (z[i] != '/' || z[i-1] != '*')) {
			i++;
		}
		if (z[i])
			i++;
		*token_type = TK_COMMENT;
		return i;
	case '%':
		*token_type = TK_REM;
		return 1;
	case '=':
		/* Both "=" and "==" mean equality. */
		*token_type = TK_EQ;
		return 1 + (z[1] == '=');
	case '<':
		if (z[1] == '=') {
			*token_type = TK_LE;
			return 2;
		} else if (z[1] == '>') {
			*token_type = TK_NE;
			return 2;
		} else if (z[1] == '<') {
			*token_type = TK_LSHIFT;
			return 2;
		}
		*token_type = TK_LT;
		return 1;
	case '>':
		if (z[1] == '=') {
			*token_type = TK_GE;
			return 2;
		} else if (z[1] == '>') {
			*token_type = TK_RSHIFT;
			return 2;
		}
		*token_type = TK_GT;
		return 1;
	case '!':
		if (z[1] == '=') {
			*token_type = TK_NE;
			return 2;
		}
		/*
		 * A '!' that is not followed by '=' is illegal.  The token
		 * covers the '!' and the byte after it, except when the '!'
		 * is the last byte of the input: the terminating NUL must
		 * not be counted, or the caller would step past the end of
		 * the string.
		 */
		*token_type = TK_ILLEGAL;
		return (z[1] != 0) ? 2 : 1;
	case '|':
		if (z[1] != '|') {
			*token_type = TK_BITOR;
			return 1;
		}
		*token_type = TK_CONCAT;
		return 2;
	case ',':
		*token_type = TK_COMMA;
		return 1;
	case '&':
		*token_type = TK_BITAND;
		return 1;
	case '~':
		*token_type = TK_BITNOT;
		return 1;
	case '\'':				/* FALLTHROUGH */
	case '"':
		/*
		 * Quoted string.  A doubled delimiter inside the string is
		 * an escaped delimiter and does not end it.  An unterminated
		 * string runs to the end of the input.
		 */
		delim = z[0];
		for (i = 1; z[i]; i++) {
			if (z[i] == delim) {
				if (z[i+1] == delim) {
					i++;
				} else {
					break;
				}
			}
		}
		if (z[i])
			i++;
		*token_type = TK_STRING;
		return i;
	case '.':
		*token_type = TK_DOT;
		return 1;
	case '0':				/* FALLTHROUGH */
	case '1':				/* FALLTHROUGH */
	case '2':				/* FALLTHROUGH */
	case '3':				/* FALLTHROUGH */
	case '4':				/* FALLTHROUGH */
	case '5':				/* FALLTHROUGH */
	case '6':				/* FALLTHROUGH */
	case '7':				/* FALLTHROUGH */
	case '8':				/* FALLTHROUGH */
	case '9':
		/* Integer, optionally followed by a fraction and/or exponent. */
		*token_type = TK_INTEGER;
		i = 1;
		while (isdigit(z[i])) {
			i++;
		}
		if (z[i] == '.' && isdigit(z[i+1])) {
			i += 2;
			while (isdigit(z[i])) {
				i++;
			}
			*token_type = TK_FLOAT;
		}
		if ((z[i] == 'e' || z[i] == 'E') &&
		    (isdigit(z[i+1]) ||
		    ((z[i+1] == '+' || z[i+1] == '-') && isdigit(z[i+2])))) {
			/* Skip the 'e' and either the sign or first digit. */
			i += 2;
			while (isdigit(z[i])) {
				i++;
			}
			*token_type = TK_FLOAT;
		}
		return i;
	case '[':
		/*
		 * Bracketed identifier: everything up to and including the
		 * closing ']', or to the end of the input if there is none.
		 */
		i = 1;
		while (z[i] && z[i-1] != ']') {
			i++;
		}
		*token_type = TK_ID;
		return i;
	case '?':
		*token_type = TK_VARIABLE;
		return 1;
	default:
		/*
		 * id_char_p[] only covers 7-bit characters; any byte with
		 * the high bit set counts as an identifier character.
		 */
		if ((*z & 0x80) == 0 && !id_char_p[*z]) {
			break;
		}
		i = 1;
		while ((z[i] & 0x80) != 0 || id_char_p[z[i]]) {
			i++;
		}
		*token_type = __get_keyword_code((const char *)z, i);
		return i;
	}

	/* Anything not recognized above is a one-byte illegal token. */
	*token_type = TK_ILLEGAL;
	return 1;
}
/*
* __run_sql_parser --
* Run the parser on the given SQL string. The parser structure is
* passed in. The number of errors encountered is returned; the
* DBSQL_ status code is left in parser->rc.
*
* PUBLIC: int __run_sql_parser __P((parser_t *, const char *, char **));
*/
/*TODO: REMOVE THIS If an error occurs
* and pzErrMsg!=NULL then an error message might be written into
* memory obtained from malloc() and *pzErrMsg made to point to that
* error message. Or maybe not.
*/
int
__run_sql_parser(parser, sql, err_msgs)
parser_t *parser;
const char *sql;
char **err_msgs;
{
int nerr = 0;
int i;
void *engine;
int token_type;
int last_token_parsed = -1;
DBSQL *dbp = parser->db;
extern void *__sql_parser_alloc(DBSQL *, int(*)(DBSQL*,size_t,void *));
extern void __sql_parser_free(DBSQL *, void *, void(*)(DBSQL *,void*));
extern int __sql_parser(void*, int, token_t, parser_t*);
dbp->flags &= ~DBSQL_Interrupt;
parser->rc = DBSQL_SUCCESS;
i = 0;
engine = __sql_parser_alloc(dbp, __dbsql_malloc);
if (engine == 0) {
__str_append(err_msgs, "out of memory", (char*)0);
return 1;
}
parser->sLastToken.dyn = 0;
parser->zTail = sql;
while (parser->rc == DBSQL_SUCCESS && sql[i] != 0) {
DBSQL_ASSERT(i >= 0);
parser->sLastToken.z = &sql[i];
DBSQL_ASSERT(parser->sLastToken.dyn == 0);
parser->sLastToken.n = __get_token((unsigned char*)&sql[i],
&token_type);
i += parser->sLastToken.n;
switch (token_type) {
case TK_SPACE: /* FALLTHROUGH */
case TK_COMMENT:
if ((dbp->flags & DBSQL_Interrupt) != 0) {
parser->rc = DBSQL_INTERRUPTED;
__str_append(err_msgs, "interrupt",
(char*)0);
goto abort_parse;
}
break;
case TK_ILLEGAL:
__str_nappend(err_msgs, "unrecognized token: \"",
-1, parser->sLastToken.z,
parser->sLastToken.n, "\"", 1, NULL);
nerr++;
goto abort_parse;
break;
case TK_SEMI:
parser->zTail = &sql[i];
/* FALLTHROUGH */
default:
__sql_parser(engine, token_type, parser->sLastToken,
parser);
last_token_parsed = token_type;
if (parser->rc != DBSQL_SUCCESS) {
goto abort_parse;
}
break;
}
}
abort_parse:
if (sql[i] == 0 && nerr == 0 && parser->rc == DBSQL_SUCCESS) {
if (last_token_parsed != TK_SEMI) {
__sql_parser(engine, TK_SEMI, parser->sLastToken,
parser);
parser->zTail = &sql[i];
}
__sql_parser(engine, 0, parser->sLastToken, parser);
}
__sql_parser_free(dbp, engine, __dbsql_free);
if (parser->rc != DBSQL_SUCCESS && parser->rc != DBSQL_DONE &&
parser->zErrMsg == 0) {
__str_append(&parser->zErrMsg,
dbsql_strerror(parser->rc), (char*)0);
}
if (parser->zErrMsg) {
if (err_msgs && *err_msgs == 0) {
*err_msgs = parser->zErrMsg;
} else {
__dbsql_free(dbp, parser->zErrMsg);
}
parser->zErrMsg = 0;
if (!nerr)
nerr++;
}
if (parser->pVdbe && (parser->useCallback || parser->nErr > 0)) {
__vdbe_delete(parser->pVdbe);
parser->pVdbe = 0;
}
if (parser->pNewTable) {
__vdbe_delete_table(parser->db, parser->pNewTable);
parser->pNewTable = 0;
}
if (parser->pNewTrigger) {
__vdbe_delete_trigger(parser->pNewTrigger);
parser->pNewTrigger = 0;
}
if (nerr > 0 &&
(parser->rc == DBSQL_SUCCESS || parser->rc == DBSQL_DONE)) {
parser->rc = DBSQL_ERROR;
}
return nerr;
}
/*
* Token types used by the dbsql_complete_stmt() routine. See the header
* comments on that procedure for additional information.
*/
/* The values are column indexes into the trans[][] state table. */
enum {
	tkEXPLAIN = 0,		/* The "explain" keyword */
	tkCREATE = 1,		/* The "create" keyword */
	tkTEMP = 2,		/* The "temp" or "temporary" keyword */
	tkTRIGGER = 3,		/* The "trigger" keyword */
	tkEND = 4,		/* The "end" keyword */
	tkSEMI = 5,		/* A semicolon */
	tkWS = 6,		/* White space or a comment */
	tkOTHER = 7		/* Any other SQL token */
};
/*
* dbsql_complete_stmt --
*
* Return TRUE if the given SQL string ends in a semicolon.
*
* Special handling is required for CREATE TRIGGER statements.
* Whenever the CREATE TRIGGER keywords are seen, the statement
* must end with ";END;".
*
* This implementation uses a state machine with 7 states:
*
* (0) START At the beginning or end of an SQL statement. This routine
* returns 1 if it ends in the START state and 0 if it ends
* in any other state.
*
* (1) EXPLAIN The keyword EXPLAIN has been seen at the beginning of
* a statement.
*
* (2) CREATE The keyword CREATE has been seen at the beginning of a
* statement, possibly preceded by EXPLAIN and/or followed by
* TEMP or TEMPORARY
*
* (3) NORMAL We are in the middle of statement which ends with a single
* semicolon.
*
* (4) TRIGGER We are in the middle of a trigger definition that must be
* ended by a semicolon, the keyword END, and another
* semicolon.
*
* (5) SEMI We've seen the first semicolon in the ";END;" that occurs at
* the end of a trigger definition.
*
* (6) END We've seen the ";END" of the ";END;" that occurs at the end
* of a trigger definition.
*
* Transitions between states above are determined by tokens extracted
* from the input. The following tokens are significant:
*
* (0) tkEXPLAIN The "explain" keyword.
* (1) tkCREATE The "create" keyword.
* (2) tkTEMP The "temp" or "temporary" keyword.
* (3) tkTRIGGER The "trigger" keyword.
* (4) tkEND The "end" keyword.
* (5) tkSEMI A semicolon.
* (6) tkWS Whitespace
* (7) tkOTHER Any other SQL token.
*
* Whitespace never causes a state transition and is always ignored.
*
* EXTERN: int dbsql_complete_stmt __P((const char *));
*
*/
int
dbsql_complete_stmt(sql)
	const char *sql;
{
	u_int8_t state = 0;	/* Current state, using values from comment */
	u_int8_t token;		/* Value of the next token */
	u_int8_t ch;		/* Byte being classified as identifier or not */
	int c;			/* Delimiter of the quoted string being skipped */
	int nid;		/* Length of the current identifier */
	/*
	 * The following matrix defines the transition from one state to
	 * another according to what token is seen.  trans[state][token]
	 * returns the next state.
	 */
	static const u_int8_t trans[7][8] = {
		/* Token: */
		/* State: ** EXPLAIN CREATE TEMP TRIGGER END SEMI WS OTHER */
		/* 0 START: */ { 1, 2, 3, 3, 3, 0, 0, 3, },
		/* 1 EXPLAIN: */ { 3, 2, 3, 3, 3, 0, 1, 3, },
		/* 2 CREATE: */ { 3, 3, 2, 4, 3, 0, 2, 3, },
		/* 3 NORMAL: */ { 3, 3, 3, 3, 3, 0, 3, 3, },
		/* 4 TRIGGER: */ { 4, 4, 4, 4, 4, 5, 4, 4, },
		/* 5 SEMI: */ { 4, 4, 4, 4, 6, 5, 5, 4, },
		/* 6 END: */ { 4, 4, 4, 4, 4, 0, 6, 4, },
	};

	/*
	 * Each pass of the loop classifies one token and leaves sql on the
	 * token's last byte; the sql++ at the bottom then steps past it.
	 */
	while (*sql) {
		switch (*sql) {
		case ';':
			token = tkSEMI;
			break;
		case ' ':			/* FALLTHROUGH */
		case '\r':			/* FALLTHROUGH */
		case '\t':			/* FALLTHROUGH */
		case '\n':			/* FALLTHROUGH */
		case '\f':
			/* White space is ignored */
			token = tkWS;
			break;
		case '/':
			/* C-style comments */
			if (sql[1] != '*') {
				token = tkOTHER;
				break;
			}
			sql += 2;
			while (sql[0] && (sql[0] != '*' || sql[1] != '/')) {
				sql++;
			}
			/* An unterminated comment is never complete. */
			if (sql[0] == 0)
				return 0;
			sql++;
			token = tkWS;
			break;
		case '-':
			/* SQL-style comments from "--" to end of line */
			if (sql[1] != '-') {
				token = tkOTHER;
				break;
			}
			while (*sql && *sql != '\n') {
				sql++;
			}
			/*
			 * A comment that runs to the end of the input does
			 * not change whether the statement is complete.
			 */
			if (*sql == 0)
				return state == 0;
			token = tkWS;
			break;
		case '[':
			/* Microsoft-style identifiers in [...] */
			sql++;
			while (*sql && *sql != ']') {
				sql++;
			}
			if (*sql == 0)
				return 0;
			token = tkOTHER;
			break;
		case '"':
			/* single- and double-quoted strings */
			/* FALLTHROUGH */
		case '\'':
			c = *sql;
			sql++;
			while (*sql && *sql != c) {
				sql++;
			}
			if (*sql == 0)
				return 0;
			token = tkOTHER;
			break;
		default:
			/*
			 * id_char_p[] has entries for 7-bit characters only.
			 * A byte with the high bit set is always an
			 * identifier character and must not be used as an
			 * index into the table.
			 */
			ch = (u_int8_t)*sql;
			if ((ch & 0x80) == 0 && !id_char_p[ch]) {
				/* Operators and special symbols */
				token = tkOTHER;
				break;
			}

			/* Keywords and unquoted identifiers */
			nid = 1;
			for (;;) {
				ch = (u_int8_t)sql[nid];
				if ((ch & 0x80) == 0 && !id_char_p[ch])
					break;
				nid++;
			}
			switch (*sql) {
			case 'c':		/* FALLTHROUGH */
			case 'C':
				if (nid == 6 &&
				    strncasecmp(sql, "create", 6) == 0) {
					token = tkCREATE;
				} else {
					token = tkOTHER;
				}
				break;
			case 't':		/* FALLTHROUGH */
			case 'T':
				if (nid == 7 &&
				    strncasecmp(sql, "trigger", 7) == 0) {
					token = tkTRIGGER;
				} else if (nid == 4 &&
				    strncasecmp(sql, "temp", 4) == 0) {
					token = tkTEMP;
				} else if (nid == 9 &&
				    strncasecmp(sql, "temporary", 9) == 0) {
					token = tkTEMP;
				} else {
					token = tkOTHER;
				}
				break;
			case 'e':		/* FALLTHROUGH */
			case 'E':
				if (nid == 3 &&
				    strncasecmp(sql, "end", 3) == 0) {
					token = tkEND;
				} else if (nid == 7 &&
				    strncasecmp(sql, "explain", 7) == 0) {
					token = tkEXPLAIN;
				} else {
					token = tkOTHER;
				}
				break;
			default:
				token = tkOTHER;
				break;
			}
			sql += nid - 1;
			break;
		}
		state = trans[state][token];
		sql++;
	}
	return state == 0;
}