libdb/lang/sql/sqlite/ext/fts2/fts2_icu.c

/*
** 2007 June 22
**
** The author disclaims copyright to this source code.  In place of
** a legal notice, here is a blessing:
**
**    May you do good and not evil.
**    May you find forgiveness for yourself and forgive others.
**    May you share freely, never taking more than you give.
**
*************************************************************************
** This file implements a tokenizer for fts2 based on the ICU library.
** 
** $Id: fts2_icu.c,v 1.3 2008/12/18 05:30:26 danielk1977 Exp $
*/

#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2)
#ifdef SQLITE_ENABLE_ICU

#include <assert.h>
#include <string.h>
#include "fts2_tokenizer.h"

#include <unicode/ubrk.h>
#include <unicode/ucol.h>
#include <unicode/ustring.h>
#include <unicode/utf16.h>

/*
** Forward declarations so the tokenizer and cursor types can be referred
** to without the "struct" keyword. The structures themselves are defined
** immediately below.
*/
typedef struct IcuTokenizer IcuTokenizer;
typedef struct IcuCursor IcuCursor;

/*
** An ICU tokenizer object. icuCreate() allocates one of these and hands
** it back to the caller cast to (sqlite3_tokenizer *), which is why the
** base structure is the first member.
*/
struct IcuTokenizer {
  sqlite3_tokenizer base;     /* Base structure; first member (see above) */
  char *zLocale;              /* Locale passed to ubrk_open(); zeroed if no
                              ** creation argument was supplied. Points into
                              ** the same allocation as this structure. */
};

/*
** A cursor used to iterate through the tokens of a single input string.
** icuOpen() allocates the structure together with the aChar[] and
** aOffset[] arrays in one block; zBuffer is allocated separately by
** icuNext() and released by icuClose().
*/
struct IcuCursor {
  sqlite3_tokenizer_cursor base;   /* Base structure; cursor is cast to this */

  UBreakIterator *pIter;      /* ICU break-iterator object */
  int nChar;                  /* Number of UChar elements used in aChar[] */
  UChar *aChar;               /* Case-folded copy of input, utf-16 encoded */
  int *aOffset;               /* aOffset[i] is the byte offset in the utf-8
                              ** input corresponding to index i of aChar[] */

  int nBuffer;                /* Size in bytes of the zBuffer allocation */
  char *zBuffer;              /* utf-8 text of the token most recently
                              ** returned by icuNext() */

  int iToken;                 /* Position value assigned to the next token */
};

/*
** Allocate and initialise a new ICU tokenizer object.
**
** If at least one creation argument is supplied, argv[0] is used as the
** locale name. A copy of it (nul-terminator included) is stored in the
** same allocation, directly after the IcuTokenizer structure. With no
** arguments the zLocale field is left zeroed.
**
** Returns SQLITE_OK and sets *ppTokenizer on success, or SQLITE_NOMEM if
** the allocation fails (in which case *ppTokenizer is not written).
*/
static int icuCreate(
  int argc,                            /* Number of entries in argv[] */
  const char * const *argv,            /* Tokenizer creation arguments */
  sqlite3_tokenizer **ppTokenizer      /* OUT: Created tokenizer */
){
  /* Bytes needed for the locale copy, or 0 if no locale was given */
  int nLocale = (argc>0) ? (int)(strlen(argv[0])+1) : 0;
  IcuTokenizer *pNew;

  pNew = (IcuTokenizer *)sqlite3_malloc(sizeof(IcuTokenizer)+nLocale);
  if( pNew==0 ) return SQLITE_NOMEM;
  memset(pNew, 0, sizeof(IcuTokenizer));

  if( nLocale!=0 ){
    /* The locale string lives in the tail of the allocation */
    char *zCopy = (char *)&pNew[1];
    memcpy(zCopy, argv[0], nLocale);
    pNew->zLocale = zCopy;
  }

  *ppTokenizer = (sqlite3_tokenizer *)pNew;
  return SQLITE_OK;
}

/*
** Release a tokenizer object. The locale string shares the allocation
** made by icuCreate(), so a single sqlite3_free() is all that is needed.
** Always returns SQLITE_OK.
*/
static int icuDestroy(sqlite3_tokenizer *pTokenizer){
  sqlite3_free((IcuTokenizer *)pTokenizer);
  return SQLITE_OK;
}

/*
** Prepare to begin tokenizing a particular string.  The input
** string to be tokenized is zInput[0..nInput-1].  If nInput is negative,
** zInput is assumed to be nul-terminated and its length is measured with
** strlen(). A NULL zInput is treated as an empty string. A cursor
** used to incrementally tokenize this string is returned in
** *ppCursor.
**
** The input is decoded from utf-8, case-folded and stored as utf-16 in
** IcuCursor.aChar[]. For each utf-16 index, IcuCursor.aOffset[] records
** the corresponding byte offset in the original utf-8 input. Decoding
** stops at the first nul character or malformed utf-8 sequence.
**
** Returns SQLITE_OK on success, SQLITE_NOMEM if the cursor cannot be
** allocated, or SQLITE_ERROR if utf-16 encoding or ubrk_open() fails.
** On any error *ppCursor is left set to 0.
*/
static int icuOpen(
  sqlite3_tokenizer *pTokenizer,         /* The tokenizer */
  const char *zInput,                    /* Input string */
  int nInput,                            /* Length of zInput in bytes */
  sqlite3_tokenizer_cursor **ppCursor    /* OUT: Tokenization cursor */
){
  IcuTokenizer *p = (IcuTokenizer *)pTokenizer;
  IcuCursor *pCsr;

  const int32_t opt = U_FOLD_CASE_DEFAULT;
  UErrorCode status = U_ZERO_ERROR;
  int nChar;                   /* Capacity of aChar[] passed to U16_APPEND */
  int nCharAlloc;              /* nChar rounded up to a multiple of 4 */

  UChar32 c;
  int iInput = 0;
  int iOut = 0;

  *ppCursor = 0;

  if( zInput==0 ){
    /* Never hand a NULL pointer to strlen() or U8_NEXT() */
    nInput = 0;
    zInput = "";
  }else if( nInput<0 ){
    nInput = strlen(zInput);
  }
  nChar = nInput+1;

  /* aOffset[] is carved out of the allocation directly after aChar[].
  ** Round the number of UChar slots up to a multiple of four so that the
  ** int array that follows starts on a suitably aligned address even when
  ** nChar is odd.
  */
  nCharAlloc = (nChar+3)&~3;
  pCsr = (IcuCursor *)sqlite3_malloc(
      sizeof(IcuCursor) +                /* IcuCursor */
      nCharAlloc * sizeof(UChar) +       /* IcuCursor.aChar[] */
      (nChar+1) * sizeof(int)            /* IcuCursor.aOffset[] */
  );
  if( !pCsr ){
    return SQLITE_NOMEM;
  }
  memset(pCsr, 0, sizeof(IcuCursor));
  pCsr->aChar = (UChar *)&pCsr[1];
  pCsr->aOffset = (int *)&pCsr->aChar[nCharAlloc];

  /* Decode, fold and re-encode one code point per iteration. U8_NEXT()
  ** requires iInput<nInput on entry, so it is only invoked while input
  ** remains. A nul character or a decoding error (c<=0) ends the loop.
  */
  pCsr->aOffset[iOut] = iInput;
  while( iInput<nInput ){
    int isError = 0;
    U8_NEXT(zInput, iInput, nInput, c);
    if( c<=0 ) break;
    c = u_foldCase(c, opt);
    U16_APPEND(pCsr->aChar, iOut, nChar, c, isError);
    if( isError ){
      sqlite3_free(pCsr);
      return SQLITE_ERROR;
    }
    /* Byte offset of whatever follows the code point just stored */
    pCsr->aOffset[iOut] = iInput;
  }

  pCsr->pIter = ubrk_open(UBRK_WORD, p->zLocale, pCsr->aChar, iOut, &status);
  if( !U_SUCCESS(status) ){
    sqlite3_free(pCsr);
    return SQLITE_ERROR;
  }
  pCsr->nChar = iOut;

  ubrk_first(pCsr->pIter);
  *ppCursor = (sqlite3_tokenizer_cursor *)pCsr;
  return SQLITE_OK;
}

/*
** Dispose of a tokenization cursor obtained from icuOpen(). The ICU
** break iterator is closed, then the token buffer and the cursor
** allocation itself are freed. Always returns SQLITE_OK.
*/
static int icuClose(sqlite3_tokenizer_cursor *pCursor){
  IcuCursor *pCur = (IcuCursor *)pCursor;
  UBreakIterator *pBreak = pCur->pIter;
  char *zTokenBuf = pCur->zBuffer;

  ubrk_close(pBreak);
  sqlite3_free(zTokenBuf);
  sqlite3_free(pCur);
  return SQLITE_OK;
}

/*
** Extract the next token from a tokenization cursor.
**
** The break iterator is advanced one segment at a time. Leading
** whitespace is trimmed from each segment, and segments that consist
** entirely of whitespace are skipped. The text of the first remaining
** segment is converted to utf-8 in IcuCursor.zBuffer (grown as required)
** and returned via *ppToken. The returned text is not nul-terminated
** here; its length is reported in *pnBytes.
**
** Returns SQLITE_OK when a token is returned, SQLITE_DONE when the
** iterator is exhausted, SQLITE_NOMEM if the buffer cannot be grown, or
** SQLITE_ERROR if the utf-16 to utf-8 conversion fails.
*/
static int icuNext(
  sqlite3_tokenizer_cursor *pCursor,  /* Cursor returned by icuOpen */
  const char **ppToken,               /* OUT: *ppToken is the token text */
  int *pnBytes,                       /* OUT: Number of bytes in token */
  int *piStartOffset,                 /* OUT: Starting offset of token */
  int *piEndOffset,                   /* OUT: Ending offset of token */
  int *piPosition                     /* OUT: Position integer of token */
){
  IcuCursor *pCsr = (IcuCursor *)pCursor;

  int iStart = 0;
  int iEnd = 0;
  int nByte = 0;

  while( iStart==iEnd ){
    UChar32 c;

    iStart = ubrk_current(pCsr->pIter);
    iEnd = ubrk_next(pCsr->pIter);
    if( iEnd==UBRK_DONE ){
      return SQLITE_DONE;
    }

    /* Skip leading whitespace. aChar[] holds utf-16 code units, so it
    ** must be decoded with U16_NEXT(). Using the utf-8 decoder here would
    ** truncate each code unit to its low 8 bits and misclassify
    ** non-ASCII characters.
    */
    while( iStart<iEnd ){
      int iWhite = iStart;
      U16_NEXT(pCsr->aChar, iWhite, pCsr->nChar, c);
      if( u_isspace(c) ){
        iStart = iWhite;
      }else{
        break;
      }
    }
    assert(iStart<=iEnd);
  }

  /* Convert the token to utf-8. On the first pass nByte is zero and the
  ** existing buffer is tried as-is; u_strToUTF8() reports the required
  ** size in nByte, and if that exceeds the buffer the loop grows it and
  ** retries.
  */
  do {
    UErrorCode status = U_ZERO_ERROR;
    if( nByte ){
      char *zNew = sqlite3_realloc(pCsr->zBuffer, nByte);
      if( !zNew ){
        return SQLITE_NOMEM;
      }
      pCsr->zBuffer = zNew;
      pCsr->nBuffer = nByte;
    }

    u_strToUTF8(
        pCsr->zBuffer, pCsr->nBuffer, &nByte,    /* Output vars */
        &pCsr->aChar[iStart], iEnd-iStart,       /* Input vars */
        &status                                  /* Output success/failure */
    );

    /* A buffer overflow is handled by the retry above. Any other failure
    ** means the buffer contents are not a valid token, so report an error
    ** rather than returning stale data.
    */
    if( U_FAILURE(status) && status!=U_BUFFER_OVERFLOW_ERROR ){
      return SQLITE_ERROR;
    }
  } while( nByte>pCsr->nBuffer );

  *ppToken = pCsr->zBuffer;
  *pnBytes = nByte;
  *piStartOffset = pCsr->aOffset[iStart];
  *piEndOffset = pCsr->aOffset[iEnd];
  *piPosition = pCsr->iToken++;

  return SQLITE_OK;
}

/*
** The set of routines that implement the ICU tokenizer. The entries are
** positional, so their order must match the member order of
** sqlite3_tokenizer_module.
*/
static const sqlite3_tokenizer_module icuTokenizerModule = {
  0,                           /* iVersion */
  icuCreate,                   /* xCreate  */
  icuDestroy,                  /* xDestroy */
  icuOpen,                     /* xOpen    */
  icuClose,                    /* xClose   */
  icuNext,                     /* xNext    */
};

/*
** Hand back the ICU tokenizer's method table: *ppModule is set to the
** address of the static icuTokenizerModule structure defined above.
*/
void sqlite3Fts2IcuTokenizerModule(
  sqlite3_tokenizer_module const**ppModule
){
  const sqlite3_tokenizer_module *pIcuModule = &icuTokenizerModule;
  *ppModule = pIcuModule;
}

#endif /* defined(SQLITE_ENABLE_ICU) */
#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2) */
Release 5.2.28 on 6/10/2011 2011-09-13 17:44:24 +00:00			`/*`
			`** 2007 June 22`
			`**`
			`** The author disclaims copyright to this source code. In place of`
			`** a legal notice, here is a blessing:`
			`**`
			`** May you do good and not evil.`
			`** May you find forgiveness for yourself and forgive others.`
			`** May you share freely, never taking more than you give.`
			`**`
			`*************************************************************************`
			`** This file implements a tokenizer for fts2 based on the ICU library.`
			`**`
			`** $Id: fts2_icu.c,v 1.3 2008/12/18 05:30:26 danielk1977 Exp $`
			`*/`

			`#if !defined(SQLITE_CORE) \|\| defined(SQLITE_ENABLE_FTS2)`
			`#ifdef SQLITE_ENABLE_ICU`

			`#include <assert.h>`
			`#include <string.h>`
			`#include "fts2_tokenizer.h"`

			`#include <unicode/ubrk.h>`
			`#include <unicode/ucol.h>`
			`#include <unicode/ustring.h>`
			`#include <unicode/utf16.h>`

			`typedef struct IcuTokenizer IcuTokenizer;`
			`typedef struct IcuCursor IcuCursor;`

			`struct IcuTokenizer {`
			`sqlite3_tokenizer base;`
			`char *zLocale;`
			`};`

			`struct IcuCursor {`
			`sqlite3_tokenizer_cursor base;`

			`UBreakIterator *pIter; /* ICU break-iterator object */`
			`int nChar; /* Number of UChar elements in pInput */`
			`UChar *aChar; /* Copy of input using utf-16 encoding */`
			`int *aOffset; /* Offsets of each character in utf-8 input */`

			`int nBuffer;`
			`char *zBuffer;`

			`int iToken;`
			`};`

			`/*`
			`** Create a new tokenizer instance.`
			`*/`
			`static int icuCreate(`
			`int argc, /* Number of entries in argv[] */`
			`const char * const *argv, /* Tokenizer creation arguments */`
			`sqlite3_tokenizer **ppTokenizer /* OUT: Created tokenizer */`
			`){`
			`IcuTokenizer *p;`
			`int n = 0;`

			`if( argc>0 ){`
			`n = strlen(argv[0])+1;`
			`}`
			`p = (IcuTokenizer *)sqlite3_malloc(sizeof(IcuTokenizer)+n);`
			`if( !p ){`
			`return SQLITE_NOMEM;`
			`}`
			`memset(p, 0, sizeof(IcuTokenizer));`

			`if( n ){`
			`p->zLocale = (char *)&p[1];`
			`memcpy(p->zLocale, argv[0], n);`
			`}`

			`*ppTokenizer = (sqlite3_tokenizer *)p;`

			`return SQLITE_OK;`
			`}`

			`/*`
			`** Destroy a tokenizer`
			`*/`
			`static int icuDestroy(sqlite3_tokenizer *pTokenizer){`
			`IcuTokenizer *p = (IcuTokenizer *)pTokenizer;`
			`sqlite3_free(p);`
			`return SQLITE_OK;`
			`}`

			`/*`
			`** Prepare to begin tokenizing a particular string. The input`
			`** string to be tokenized is pInput[0..nBytes-1]. A cursor`
			`** used to incrementally tokenize this string is returned in`
			`** *ppCursor.`
			`*/`
			`static int icuOpen(`
			`sqlite3_tokenizer *pTokenizer, /* The tokenizer */`
			`const char *zInput, /* Input string */`
			`int nInput, /* Length of zInput in bytes */`
			`sqlite3_tokenizer_cursor **ppCursor /* OUT: Tokenization cursor */`
			`){`
			`IcuTokenizer *p = (IcuTokenizer *)pTokenizer;`
			`IcuCursor *pCsr;`

			`const int32_t opt = U_FOLD_CASE_DEFAULT;`
			`UErrorCode status = U_ZERO_ERROR;`
			`int nChar;`

			`UChar32 c;`
			`int iInput = 0;`
			`int iOut = 0;`

			`*ppCursor = 0;`

			`if( nInput<0 ){`
			`nInput = strlen(zInput);`
			`}`
			`nChar = nInput+1;`
			`pCsr = (IcuCursor *)sqlite3_malloc(`
			`sizeof(IcuCursor) + /* IcuCursor */`
			`nChar * sizeof(UChar) + /* IcuCursor.aChar[] */`
			`(nChar+1) * sizeof(int) /* IcuCursor.aOffset[] */`
			`);`
			`if( !pCsr ){`
			`return SQLITE_NOMEM;`
			`}`
			`memset(pCsr, 0, sizeof(IcuCursor));`
			`pCsr->aChar = (UChar *)&pCsr[1];`
			`pCsr->aOffset = (int *)&pCsr->aChar[nChar];`

			`pCsr->aOffset[iOut] = iInput;`
			`U8_NEXT(zInput, iInput, nInput, c);`
			`while( c>0 ){`
			`int isError = 0;`
			`c = u_foldCase(c, opt);`
			`U16_APPEND(pCsr->aChar, iOut, nChar, c, isError);`
			`if( isError ){`
			`sqlite3_free(pCsr);`
			`return SQLITE_ERROR;`
			`}`
			`pCsr->aOffset[iOut] = iInput;`

			`if( iInput<nInput ){`
			`U8_NEXT(zInput, iInput, nInput, c);`
			`}else{`
			`c = 0;`
			`}`
			`}`

			`pCsr->pIter = ubrk_open(UBRK_WORD, p->zLocale, pCsr->aChar, iOut, &status);`
			`if( !U_SUCCESS(status) ){`
			`sqlite3_free(pCsr);`
			`return SQLITE_ERROR;`
			`}`
			`pCsr->nChar = iOut;`

			`ubrk_first(pCsr->pIter);`
			`*ppCursor = (sqlite3_tokenizer_cursor *)pCsr;`
			`return SQLITE_OK;`
			`}`

			`/*`
			`** Close a tokenization cursor previously opened by a call to icuOpen().`
			`*/`
			`static int icuClose(sqlite3_tokenizer_cursor *pCursor){`
			`IcuCursor *pCsr = (IcuCursor *)pCursor;`
			`ubrk_close(pCsr->pIter);`
			`sqlite3_free(pCsr->zBuffer);`
			`sqlite3_free(pCsr);`
			`return SQLITE_OK;`
			`}`

			`/*`
			`** Extract the next token from a tokenization cursor.`
			`*/`
			`static int icuNext(`
			`sqlite3_tokenizer_cursor *pCursor, /* Cursor returned by simpleOpen */`
			`const char **ppToken, /* OUT: *ppToken is the token text */`
			`int *pnBytes, /* OUT: Number of bytes in token */`
			`int *piStartOffset, /* OUT: Starting offset of token */`
			`int *piEndOffset, /* OUT: Ending offset of token */`
			`int *piPosition /* OUT: Position integer of token */`
			`){`
			`IcuCursor *pCsr = (IcuCursor *)pCursor;`

			`int iStart = 0;`
			`int iEnd = 0;`
			`int nByte = 0;`

			`while( iStart==iEnd ){`
			`UChar32 c;`

			`iStart = ubrk_current(pCsr->pIter);`
			`iEnd = ubrk_next(pCsr->pIter);`
			`if( iEnd==UBRK_DONE ){`
			`return SQLITE_DONE;`
			`}`

			`while( iStart<iEnd ){`
			`int iWhite = iStart;`
			`U8_NEXT(pCsr->aChar, iWhite, pCsr->nChar, c);`
			`if( u_isspace(c) ){`
			`iStart = iWhite;`
			`}else{`
			`break;`
			`}`
			`}`
			`assert(iStart<=iEnd);`
			`}`

			`do {`
			`UErrorCode status = U_ZERO_ERROR;`
			`if( nByte ){`
			`char *zNew = sqlite3_realloc(pCsr->zBuffer, nByte);`
			`if( !zNew ){`
			`return SQLITE_NOMEM;`
			`}`
			`pCsr->zBuffer = zNew;`
			`pCsr->nBuffer = nByte;`
			`}`

			`u_strToUTF8(`
			`pCsr->zBuffer, pCsr->nBuffer, &nByte, /* Output vars */`
			`&pCsr->aChar[iStart], iEnd-iStart, /* Input vars */`
			`&status /* Output success/failure */`
			`);`
			`} while( nByte>pCsr->nBuffer );`

			`*ppToken = pCsr->zBuffer;`
			`*pnBytes = nByte;`
			`*piStartOffset = pCsr->aOffset[iStart];`
			`*piEndOffset = pCsr->aOffset[iEnd];`
			`*piPosition = pCsr->iToken++;`

			`return SQLITE_OK;`
			`}`

			`/*`
			`** The set of routines that implement the simple tokenizer`
			`*/`
			`static const sqlite3_tokenizer_module icuTokenizerModule = {`
			`0, /* iVersion */`
			`icuCreate, /* xCreate */`
			`icuDestroy, /* xCreate */`
			`icuOpen, /* xOpen */`
			`icuClose, /* xClose */`
			`icuNext, /* xNext */`
			`};`

			`/*`
			`** Set *ppModule to point at the implementation of the ICU tokenizer.`
			`*/`
			`void sqlite3Fts2IcuTokenizerModule(`
			`sqlite3_tokenizer_module const**ppModule`
			`){`
			`*ppModule = &icuTokenizerModule;`
			`}`

			`#endif /* defined(SQLITE_ENABLE_ICU) */`
			`#endif /* !defined(SQLITE_CORE) \|\| defined(SQLITE_ENABLE_FTS2) */`