stasis-bLSM/UCharUtils.cc

/* $Id: UCharUtils.cc,v 1.16 2009/03/03 20:19:18 dlomax Exp $ */
/* Copyright (C) 2008 Yahoo! Inc. All Rights Reserved. */

//#include <dht/UCharUtils.h>
#include "UCharUtils.h"
#include <log4cpp/Category.hh>
#include "LogUtils.h"
//#include "ActionContext.h"
#include <unicode/ucnv.h>
#include <unicode/unorm.h>
#include <thoth/validate.h> // To make sure we have UTF-8

// Logger for this file. Its category name is "dht.framework." followed by
// this file's __FILE__ string.
// NOTE(review): presumably consumed by the LOG_METHOD() / DHT_*_STREAM()
// macros from LogUtils.h -- confirm there.
static log4cpp::Category &log = 
                    log4cpp::Category::getInstance("dht.framework." __FILE__);


// The one UCharUtilsImpl that the UCharUtils functions forward to.
// NULL until UCharUtils::init() creates it; UCharUtils::close() deletes it
// and sets it back to NULL.
UCharUtilsImpl *UCharUtils::instance_ = NULL;

/**
 * Build an implementation object that owns nothing yet: no converter and
 * no work buffers. The converter is opened by init(); the buffers are
 * allocated on demand by convert() and normalize().
 */
UCharUtilsImpl::
UCharUtilsImpl() : uconv_(NULL) {
    LOG_METHOD();

    // All three work buffers start out empty.
    ucBuffLen = ucNormBuffLen = charBuffLen = 0;

    ucBuff = NULL;      // UTF-8 -> UChar conversion output
    ucNormBuff = NULL;  // normalized UChar output
    charBuff = NULL;    // UChar -> UTF-8 conversion output
}

/**
 * Open the ICU "utf-8" converter that convert() and normalize() use.
 *
 * @return FwCode::FwOk when the converter was opened,
 *         FwCode::UcnvOpenFailed when ucnv_open() returned NULL (the ICU
 *         error code and name are logged).
 */
FwCode::ResponseCode UCharUtilsImpl::
init()
{
    UErrorCode openStatus = U_ZERO_ERROR;
    uconv_ = ucnv_open("utf-8", &openStatus);

    if (uconv_ != NULL) {
        return FwCode::FwOk;
    }

    DHT_ERROR_STREAM() << "EC:UNICODE:Problem geting utf-8 converter, erc:" << openStatus
                       << ", " << u_errorName(openStatus);
    return FwCode::UcnvOpenFailed;
}

/**
 * Free the work buffers, then close the ICU converter if one was opened.
 */
UCharUtilsImpl::
~UCharUtilsImpl() {
    reset();

    if (uconv_ == NULL) {
        return;  // init() never succeeded; nothing to close
    }

    ucnv_close(uconv_);
    uconv_ = NULL;
}

namespace {

// Free one array buffer and zero its recorded length. A buffer that is
// already NULL is left completely untouched (its length is not rewritten).
template <typename Elem, typename Len>
void releaseArray(Elem *&buffer, Len &length)
{
    if (buffer == NULL) {
        return;
    }
    delete[] buffer;
    length = 0;
    buffer = NULL;
}

} // namespace

/**
 * Release all three work buffers (UChar input, normalized UChar, UTF-8
 * output). The converter is not touched.
 */
void UCharUtilsImpl::
reset() {
    LOG_METHOD();

    releaseArray(ucBuff, ucBuffLen);
    releaseArray(ucNormBuff, ucNormBuffLen);
    releaseArray(charBuff, charBuffLen);
}

/**
 * Small wrapper to hide multi-line thoth api inside single-line call.
 */
bool UCharUtils::
isUTF8(const std::string& value)
{
    size_t pos = 0;
    thoth_result result = thoth_validate_utf8(value.c_str(), value.length(),
                                              &pos);
			
    if(result != UTF8_VALID) {
        std::cerr 
            //RESPONSE_DEBUG_STREAM(FwCode::DataNotUtf8)
            << "value (" << value << ") is not UTF-8. thoth_result:" << result
            << ", position=" << pos;
        return false;
    }
    return true;
}

/**
 * Small wrapper to hide multi-line thoth api inside single-line call.
 */
bool UCharUtils::
isUTF8(const char * value, size_t value_len)
{
    size_t pos = 0;
    thoth_result result = thoth_validate_utf8(value, value_len, &pos);
			
    if(result != UTF8_VALID) {
        //RESPONSE_DEBUG_STREAM(FwCode::DataNotUtf8)
        std::cerr
            << "value (" << std::string(value, value_len)
            << ") is not UTF-8. thoth_result:" << result
            << ", position=" << pos;
        return false;
    }
    return true;
}

// Convert an input string (expected to be UTF-8) into unicode UChars.
// The result of the conversion will be sitting in our ucBuff area, which is
// reused (and grown when needed) across calls.
//
// input: bytes to convert; rejected with DataNotUtf8 if validation fails.
// len:   out; number of UChars written to ucBuff. Set to 0 on every
//        failure path.
//
// Returns FwOk on success, DataNotUtf8 when the input does not validate,
// and ConvertToUCharFailed when the input is too long for the int-sized
// buffer arithmetic or when ucnv_toUChars() reports a failure.
FwCode::ResponseCode UCharUtilsImpl::
convert(const std::string &input, int32_t &len)
{
    LOG_METHOD();

    // Callers get a defined length on every failure path, not just some.
    len = 0;

    //UTF-8 validation
    if(!UCharUtils::isUTF8(input)) {
        return FwCode::DataNotUtf8;
    }

    // The capacity below is computed in int; refuse inputs for which
    // length * 2 + 1 would not fit instead of overflowing.
    const std::string::size_type maxInputBytes = (0x7fffffff - 1) / 2;
    if (input.length() > maxInputBytes) {
        std::cerr
            << "EC:UNICODE:error: input too long to convert, length:"
            << input.length();
        return FwCode::ConvertToUCharFailed;
    }

    // Keep the original 2x head-room and add one slot, so that the buffer is
    // allocated even for an empty input (ucBuff is then never NULL after a
    // successful call) and there is always room for ICU's terminating NUL.
    const int size = static_cast<int>(input.length()) * 2 + 1;

    // Check if we already have a big enough buffer
    if (ucBuffLen < size) {
        // Allocate the replacement first: if new[] throws, the old buffer
        // and its recorded length are still valid and consistent.
        UChar *grown = new UChar[size];
        delete[] ucBuff;
        ucBuff = grown;
        ucBuffLen = size;
    }

    UErrorCode erc = U_ZERO_ERROR;
    const int32_t converted = ucnv_toUChars(uconv_,
                                            ucBuff,
                                            ucBuffLen,
                                            input.data(),
                                            static_cast<int32_t>(input.length()),
                                            &erc);

    if (U_FAILURE(erc)) {
        //RESPONSE_ERROR_STREAM(FwCode::ConvertToUCharFailed)
        std::cerr
            << "EC:UNICODE:error:" << erc
            << ", " << u_errorName(erc)
            << " from converting input:'" << input << "'";
        return FwCode::ConvertToUCharFailed;
    }

    len = converted;
    return FwCode::FwOk;
}

// Normalize an input string to NFC. Note that all three internal buffers
// will be used by this operation, but by the time we finish, we'll be done
// with them.
//
// input:  UTF-8 text to normalize.
// result: out; receives the normalized UTF-8. It is a copy of input when
//         the quick check says input is already NFC, and is erased when the
//         initial conversion fails.
//
// Returns FwOk on success, the error from convert() if the input could not
// be converted, and FwError if ICU normalization or the conversion back to
// UTF-8 fails.
FwCode::ResponseCode UCharUtilsImpl::
normalize(const std::string &input, std::string &result /* out */)
{
    LOG_METHOD();

    // convert our UTF-8 into UChar
    int32_t inLen = 0;
    FwCode::ResponseCode rc = convert(input, inLen);

    if (rc != FwCode::FwOk) {
        result.erase();
        return rc;
    }

    // Do a quick check if the input is already normalized so that
    // we can duck out early
    UErrorCode status = U_ZERO_ERROR;
    if (unorm_quickCheck(ucBuff, inLen,
                         UNORM_NFC, &status) == UNORM_YES) {
        DHT_DEBUG_STREAM() << "already normalized input:" << input;
        result = input;
        return FwCode::FwOk;
    }

    // Check if we have enough space for the normalized result.
    // We start with output space twice as big as the input; that is only a
    // guess, so an overflow is handled below by retrying.
    int32_t newSize = inLen * 2;
    if (newSize > ucNormBuffLen) {
        DHT_DEBUG_STREAM() << "newSize:" << newSize
                           << " ucNormBuffLen:" << ucNormBuffLen;
        // Allocate before releasing so a throwing new[] cannot leave a
        // dangling pointer or a length that does not match the buffer.
        UChar *grown = new UChar[newSize];
        delete[] ucNormBuff;
        ucNormBuff = grown;
        ucNormBuffLen = newSize;
    }

    // Do the actual normalization
    status = U_ZERO_ERROR;
    int32_t normLen = unorm_normalize(ucBuff, inLen,
                                      UNORM_NFC, 0,
                                      ucNormBuff,
                                      ucNormBuffLen,
                                      &status);

    // On overflow ICU returns the length it actually needs; grow to that
    // size and run once more instead of failing a valid input.
    if (status == U_BUFFER_OVERFLOW_ERROR && normLen > ucNormBuffLen) {
        DHT_DEBUG_STREAM() << "retrying unorm_normalize, normLen:" << normLen
                           << " ucNormBuffLen:" << ucNormBuffLen;
        UChar *grown = new UChar[normLen];
        delete[] ucNormBuff;
        ucNormBuff = grown;
        ucNormBuffLen = normLen;

        status = U_ZERO_ERROR;
        normLen = unorm_normalize(ucBuff, inLen,
                                  UNORM_NFC, 0,
                                  ucNormBuff,
                                  ucNormBuffLen,
                                  &status);
    }

    if(U_FAILURE(status)) {
        //RESPONSE_ERROR_STREAM(FwCode::FwError)
        std::cerr
            << "EC:UNICODE:error:" << status << ", " << u_errorName(status)
            <<" in unorm_normalize, inLen:" << inLen
            << " ucNormBuffLen:" << ucNormBuffLen;
        return FwCode::FwError;
    }

    // Make sure we have some space to convert back to UTF-8
    int32_t resultLen = normLen * 4;
    if (resultLen > charBuffLen) {
        DHT_DEBUG_STREAM() << "resultLen:" << resultLen
                           << " charBuffLen:" << charBuffLen;
        // Same allocate-then-release order as above.
        char *grown = new char[resultLen];
        delete[] charBuff;
        charBuff = grown;
        charBuffLen = resultLen;
    }

    DHT_DEBUG_STREAM() <<"calling ucnv_fromUChars, normLen:" << normLen;

    // Go from UChar array to UTF-8. Start from a clean status so a warning
    // left over from normalization is not reported as coming from here.
    status = U_ZERO_ERROR;
    int32_t actualLen = ucnv_fromUChars(uconv_,
                                        charBuff, charBuffLen,
                                        ucNormBuff, normLen,
                                        &status);
    if(U_FAILURE(status)) {
        //RESPONSE_ERROR_STREAM(FwCode::FwError)
        std::cerr
            << "EC:UNICODE:error:" << status << ", " << u_errorName(status)
            << " in ucnv_fromUChars charBuffLen:" << charBuffLen
            << " normLen:" << normLen;
        return FwCode::FwError;
    }

    // Smack our UTF-8 characters into the result string
    result.assign(charBuff, actualLen);
    DHT_DEBUG_STREAM() << "leaving actualLen:" << actualLen
                       << " result:" << result;
    return FwCode::FwOk;
}


/**
 * Create the shared UCharUtilsImpl on first use and open its converter.
 *
 * @return FwCode::FwOk if an instance already exists; otherwise whatever
 *         the new instance's init() returns.
 *
 * NOTE(review): when the new instance's init() fails, instance_ is still
 * left set, so a later call to this function reports FwOk.
 */
FwCode::ResponseCode UCharUtils::
init()
{
    if (instance_ != NULL) {
        return FwCode::FwOk;  // already initialized
    }

    instance_ = new UCharUtilsImpl();
    return instance_->init();
}

/**
 * Destroy the shared implementation object, if any, and forget it so that
 * a later init() creates a fresh one.
 */
void UCharUtils::
close()
{
    // delete on a NULL pointer is a no-op, so no separate check is needed.
    delete instance_;
    instance_ = NULL;
}

// Given an input string, return a unicode UChar array. Note that the
// return value is a pointer to our internal buffer: the caller must not
// free it, and the next conversion overwrites (and may reallocate) it.
//
// input: UTF-8 text to convert.
// len:   out; number of UChars in the returned array, 0 on failure.
//
// Returns NULL (with len == 0) when init() has not been called or when
// the conversion fails.
UChar * UCharUtils::
getUChar(const std::string &input, int32_t& len) {
    LOG_METHOD();

    // No instance means init() was never called (or close() already ran);
    // report that as a failed conversion instead of dereferencing NULL.
    if (instance_ == NULL
        || instance_->convert(input, len) != FwCode::FwOk) {
        len = 0;
        return NULL;
    }

    return instance_->ucBuff;
}

// Normalize input through the shared implementation object.
//
// input:  UTF-8 text to normalize.
// result: out; receives the normalized text. Erased when no instance exists.
//
// Returns FwCode::FwError when init() has not been called (or close() has
// already run); otherwise whatever UCharUtilsImpl::normalize() returns.
FwCode::ResponseCode UCharUtils::
normalize(const std::string &input, std::string &result) {
    LOG_METHOD();

    // Without an instance there is nothing to forward to; fail cleanly
    // rather than dereferencing NULL.
    if (instance_ == NULL) {
        result.erase();
        return FwCode::FwError;
    }

    return(instance_->normalize(input, result));
}


// Compile a UTF-8 regular expression pattern into an ICU URegularExpression.
//
// pattern: UTF-8 pattern text.
// result:  out; set to the compiled expression on success only. The
//          expression comes from uregex_open(); this function does not
//          close it on the success path.
//
// Returns FwOk on success, ConvertToUCharFailed when the pattern cannot be
// converted to UChars, and CompileRegExFailed when uregex_open() fails.
FwCode::ResponseCode UCharUtils::
parseRegExpPattern(const std::string &pattern,
                   URegularExpression * & result /* out */)
{
    // Zero-initialized so the offset printed below is defined even if
    // uregex_open() fails without filling it in.
    // NOTE(review): ICU appears to reject some arguments before parsing,
    // leaving the UParseError unwritten -- confirm.
    UParseError perr = UParseError();
    UErrorCode erc = U_ZERO_ERROR;
    int32_t ureglen = 0;

    // Do not delete uregexp, it's a static reusable buffer inside UCharUtils
    UChar *uregexp = UCharUtils::getUChar(pattern, ureglen);
    if (uregexp == NULL) {
        //RESPONSE_ERROR_STREAM(FwCode::ConvertToUCharFailed)
        std::cerr
            << "EC:UNICODE|IMPOSSIBLE:Unable to convert pattern to unicode: " << pattern;
        return FwCode::ConvertToUCharFailed;
    }

    URegularExpression *regexp= uregex_open(uregexp, ureglen, 0,
                                            &perr,
                                            &erc);

    // Use ICU's own failure test: a warning code is not an error. The old
    // "!= U_ZERO_ERROR" check rejected warnings and leaked the expression.
    if(U_FAILURE(erc) || regexp == NULL) {
        if (regexp != NULL) {
            uregex_close(regexp);  // do not leak a half-usable object
        }
        //RESPONSE_DEBUG_STREAM(FwCode::CompileRegExFailed)
        std::cerr
            << "Compiling regex failed at: " << perr.offset
            << "; re=" << pattern;
        return FwCode::CompileRegExFailed;
    }

    result = regexp;
    return FwCode::FwOk;
}
initial import; removed cruft from mert's tarball, tweaked make's clean targets git-svn-id: svn+ssh://svn.corp.yahoo.com/yahoo/yrl/labs/pnuts/code/logstore@520 8dad8b1f-cf64-0410-95b6-bcf113ffbcfe 2010-01-23 02:13:59 +00:00			`/* $Id: UCharUtils.cc,v 1.16 2009/03/03 20:19:18 dlomax Exp $ */`
			`/* Copyright (C) 2008 Yahoo! Inc. All Rights Reserved. */`

			`//#include <dht/UCharUtils.h>`
			`#include "UCharUtils.h"`
			`#include <log4cpp/Category.hh>`
			`#include "LogUtils.h"`
			`//#include "ActionContext.h"`
			`#include <unicode/ucnv.h>`
			`#include <unicode/unorm.h>`
			`#include <thoth/validate.h> // To make sure we have UTF-8`

			`static log4cpp::Category &log =`
			`log4cpp::Category::getInstance("dht.framework." __FILE__);`


			`UCharUtilsImpl *UCharUtils::instance_ = NULL;`

			`UCharUtilsImpl::`
			`UCharUtilsImpl() : uconv_(NULL) {`
			`LOG_METHOD();`

			`ucBuffLen = 0;`
			`ucBuff = NULL;`

			`ucNormBuffLen = 0;`
			`ucNormBuff = NULL;`

			`charBuffLen = 0;`
			`charBuff = NULL;`
			`}`

			`FwCode::ResponseCode UCharUtilsImpl::`
			`init()`
			`{`
			`UErrorCode erc = U_ZERO_ERROR;`

			`uconv_ = ucnv_open("utf-8", &erc);`
			`if (uconv_ == NULL) {`
			`DHT_ERROR_STREAM() << "EC:UNICODE:Problem geting utf-8 converter, erc:" << erc`
			`<< ", " << u_errorName(erc);`
			`return FwCode::UcnvOpenFailed;`
			`}`
			`return FwCode::FwOk;`
			`}`

			`UCharUtilsImpl::`
			`~UCharUtilsImpl() {`
			`reset();`
			`if (uconv_ != NULL) {`
			`ucnv_close(uconv_);`
			`uconv_ = NULL;`
			`}`
			`}`

			`void UCharUtilsImpl::`
			`reset() {`
			`LOG_METHOD();`

			`if (ucBuff != NULL) {`
			`delete[] ucBuff;`
			`ucBuffLen = 0;`
			`ucBuff = NULL;`
			`}`
			`if (ucNormBuff != NULL) {`
			`delete[] ucNormBuff;`
			`ucNormBuffLen = 0;`
			`ucNormBuff = NULL;`
			`}`
			`if (charBuff != NULL) {`
			`delete[] charBuff;`
			`charBuffLen = 0;`
			`charBuff = NULL;`
			`}`
			`}`

			`/**`
			`* Small wrapper to hide multi-line thoth api inside single-line call.`
			`*/`
			`bool UCharUtils::`
			`isUTF8(const std::string& value)`
			`{`
			`size_t pos = 0;`
			`thoth_result result = thoth_validate_utf8(value.c_str(), value.length(),`
			`&pos);`

			`if(result != UTF8_VALID) {`
			`std::cerr`
			`//RESPONSE_DEBUG_STREAM(FwCode::DataNotUtf8)`
			`<< "value (" << value << ") is not UTF-8. thoth_result:" << result`
			`<< ", position=" << pos;`
			`return false;`
			`}`
			`return true;`
			`}`

			`/**`
			`* Small wrapper to hide multi-line thoth api inside single-line call.`
			`*/`
			`bool UCharUtils::`
			`isUTF8(const char * value, size_t value_len)`
			`{`
			`size_t pos = 0;`
			`thoth_result result = thoth_validate_utf8(value, value_len, &pos);`

			`if(result != UTF8_VALID) {`
			`//RESPONSE_DEBUG_STREAM(FwCode::DataNotUtf8)`
			`std::cerr`
			`<< "value (" << std::string(value, value_len)`
			`<< ") is not UTF-8. thoth_result:" << result`
			`<< ", position=" << pos;`
			`return false;`
			`}`
			`return true;`
			`}`

			`// Convert an input string (expected to be UTF-8) into unicode UChars`
			`// The result of the conversion will be sitting in our ucBuff area.`
			`FwCode::ResponseCode UCharUtilsImpl::`
			`convert(const std::string &input, int32_t &len)`
			`{`
			`LOG_METHOD();`

			`//UTF-8 validation`
			`if(!UCharUtils::isUTF8(input)) {`
			`return FwCode::DataNotUtf8;`
			`}`

			`int size = input.length() * 2;`

			`// Check if we already have a big enough buffer`
			`if (ucBuffLen < size) {`
			`// Nope, first check if we need to release what we've been using`
			`if (ucBuff) {`
			`delete[] ucBuff;`
			`}`
			`ucBuffLen = size;`
			`ucBuff = new UChar[ucBuffLen];`
			`}`

			`UErrorCode erc = U_ZERO_ERROR;`
			`len = ucnv_toUChars(uconv_,`
			`ucBuff,`
			`ucBuffLen,`
			`input.data(),`
			`input.length(), &erc);`

			`if (U_FAILURE(erc)) {`
			`//RESPONSE_ERROR_STREAM(FwCode::ConvertToUCharFailed)`
			`std::cerr`
			`<< "EC:UNICODE:error:" << erc`
			`<< ", " << u_errorName(erc)`
			`<< " from converting input:'" << input << "'";`
			`len = 0;`
			`return FwCode::ConvertToUCharFailed;`
			`}`
			`return FwCode::FwOk;`
			`}`

			`// Normalize an input string. Note that all three internal buffers will`
			`// be used by this operation, but by the time we finish, we'll be done`
			`// with them.`
			`FwCode::ResponseCode UCharUtilsImpl::`
			`normalize(const std::string &input, std::string &result /* out */)`
			`{`
			`LOG_METHOD();`

			`// convert our UTF-8 into UChar`
			`int32_t inLen = 0;`
			`FwCode::ResponseCode rc = convert(input, inLen);`

			`if (rc != FwCode::FwOk) {`
			`result.erase();`
			`return rc;`
			`}`

			`// Do a quick check if the input is already normalized so that`
			`// we can duck out early`
			`UErrorCode status = U_ZERO_ERROR;`
			`if (unorm_quickCheck(ucBuff, inLen,`
			`UNORM_NFC, &status) == UNORM_YES) {`
			`DHT_DEBUG_STREAM() << "already normalized input:" << input;`
			`result = input;`
			`return FwCode::FwOk;`
			`}`

			`// Check if we have enough space for the normalized result.`
			`// We'll make the output space twice as big as the input (although`
			`// it's more likely that the normalized result will be shorter`
			`// as it combines characters. E.g. 'A' 'put an accent on the previous'`
			`int32_t newSize = inLen * 2;`
			`if (newSize > ucNormBuffLen) {`
			`DHT_DEBUG_STREAM() << "newSize:" << newSize`
			`<< " ucNormBuffLen:" << ucNormBuffLen;`
			`if (ucNormBuff) {`
			`delete[] ucNormBuff;`
			`}`
			`ucNormBuffLen = newSize;`
			`ucNormBuff = new UChar[ucNormBuffLen];`
			`}`

			`// Do the actual normalization`
			`status = U_ZERO_ERROR;`
			`int32_t normLen = unorm_normalize(ucBuff, inLen,`
			`UNORM_NFC, 0,`
			`ucNormBuff,`
			`ucNormBuffLen,`
			`&status);`
			`if(U_FAILURE(status)) {`
			`//RESPONSE_ERROR_STREAM(FwCode::FwError)`
			`std::cerr`
			`<< "EC:UNICODE:error:" << status << ", " << u_errorName(status)`
			`<<" in unorm_normalize, inLen:" << inLen`
			`<< " ucNormBuffLen:" << ucNormBuffLen;`
			`return FwCode::FwError;`
			`}`

			`// Make sure we have some space to convert back to UTF-8`
			`int32_t resultLen = normLen * 4;`
			`if (resultLen > charBuffLen) {`
			`DHT_DEBUG_STREAM() << "resultLen:" << resultLen`
			`<< " charBuffLen:" << charBuffLen;`
			`if (charBuff) {`
			`delete[] charBuff;`
			`charBuff= NULL;`
			`}`
			`charBuffLen = resultLen;`
			`charBuff = new char[charBuffLen];`
			`}`

			`DHT_DEBUG_STREAM() <<"calling ucnv_fromUChars, normLen:" << normLen;`

			`// Go from UChar array to UTF-8`
			`int32_t actualLen = ucnv_fromUChars(uconv_,`
			`charBuff, charBuffLen,`
			`ucNormBuff, normLen,`
			`&status);`
			`if(U_FAILURE(status)) {`
			`//RESPONSE_ERROR_STREAM(FwCode::FwError)`
			`std::cerr`
			`<< "EC:UNICODE:error:" << status << ", " << u_errorName(status)`
			`<< " in ucnv_fromUChars charBuffLen:" << charBuffLen`
			`<< " normLen:" << normLen;`
			`return FwCode::FwError;`
			`}`

			`// Smack our UTF-8 characters into the result string`
			`result.assign(charBuff, actualLen);`
			`DHT_DEBUG_STREAM() << "leaving actualLen:" << actualLen`
			`<< " result:" << result;`
			`return FwCode::FwOk;`
			`}`


			`FwCode::ResponseCode UCharUtils::`
			`init()`
			`{`
			`if (instance_ == NULL) {`
			`instance_ = new UCharUtilsImpl();`
			`return instance_->init();`
			`}`
			`return FwCode::FwOk; // already initialized`
			`}`

			`void UCharUtils::`
			`close()`
			`{`
			`if(instance_ != NULL) {`
			`delete instance_;`
			`instance_ = NULL;`
			`}`
			`}`

			`// Given an input string, return a unicode UChar array. Note that the`
			`// return value is a pointer to our internal buffer.`
			`UChar * UCharUtils::`
			`getUChar(const std::string &input, int32_t& len) {`
			`LOG_METHOD();`

			`// do the conversion...somehow need 2x input len for utf8 to utf16`
			`if(instance_->convert(input, len) != FwCode::FwOk) {`
			`len = 0;`
			`return NULL;`
			`}`

			`return instance_->ucBuff;`
			`}`

			`FwCode::ResponseCode UCharUtils::`
			`normalize(const std::string &input, std::string &result) {`
			`LOG_METHOD();`
			`return(instance_->normalize(input, result));`
			`}`


			`FwCode::ResponseCode UCharUtils::`
			`parseRegExpPattern(const std::string &pattern,`
			`URegularExpression * & result /* out */)`
			`{`
			`UParseError perr;`
			`UErrorCode erc = U_ZERO_ERROR;`
			`int32_t ureglen = 0;`

			`// Do not delete uregexp, it's a static reusable buffer inside UCharUtils`
			`UChar *uregexp = UCharUtils::getUChar(pattern, ureglen);`
			`if (uregexp == NULL) {`
			`//RESPONSE_ERROR_STREAM(FwCode::ConvertToUCharFailed)`
			`std::cerr`
			`<< "EC:UNICODE\|IMPOSSIBLE:Unable to convert pattern to unicode: " << pattern;`
			`return FwCode::ConvertToUCharFailed;`
			`}`

			`URegularExpression *regexp= uregex_open(uregexp, ureglen, 0,`
			`&perr,`
			`&erc);`
			`if(erc != U_ZERO_ERROR) {`
			`//RESPONSE_DEBUG_STREAM(FwCode::CompileRegExFailed)`
			`std::cerr`
			`<< "Compiling regex failed at: " << perr.offset`
			`<< "; re=" << pattern;`
			`return FwCode::CompileRegExFailed;`
			`}`

			`result = regexp;`
			`return FwCode::FwOk;`
			`}`