/*    picu_utf8.c
 *
 *    $Id: picu_utf8.c,v 1.2 2002/01/20 21:46:04 bstell Exp $
 *
 *    Copyright (c) 2000 Brian Stell
 *
 *    This package is free software and is provided ``as is'' without
 *    express or implied warranty. It may be used, redistributed and/or
 *    modified under the terms of the Perl Artistic License
 *    (see http://www.perl.com/perl/misc/Artistic.html)
 *
 */

#include "picu/picu_debug.h"
#include "picu/picu_utf8.h"
#include "unicode/unistr.h"
#include "unicode/unicode.h"

/*
 * Identifier handed to New() as its first argument in this file:
 * 10000 plus the source line number where the macro is expanded.
 */
#define PICU_UTF8_MEM_COOKIE (10000 + __LINE__)

/*
 * I know these functions have equivalents in ICU
 * but for Perl I wanted really fast routines
 * (read: no malloc's)
 *
 * I know I'm reinventing the wheel but it
 * really is a rounder wheel.
 */
/*
 * utf8_append_uchar
 *
 * Write the UTF-8 byte sequence for c at s and return the position just
 * past the last byte written.
 *
 *   s  destination; no bounds check is done and no terminator is
 *      written, so the caller provides room for up to 4 bytes
 *   c  value to encode
 *
 * Values above 0x10FFFF are emitted as the three bytes EF BF BF.
 */
char *
utf8_append_uchar(char *s, UChar c)
{
    const uint32_t cp = (uint32_t)c;

    /* one byte: 0xxxxxxx */
    if (cp <= 0x7F) {
        s[0] = (char)cp;
        return s + 1;
    }

    /* two bytes: 110xxxxx 10xxxxxx */
    if (cp <= 0x7FF) {
        s[0] = (char)(0xC0 | (cp >> 6));
        s[1] = (char)(0x80 | (cp & 0x3F));
        return s + 2;
    }

    /* three bytes: 1110xxxx 10xxxxxx 10xxxxxx */
    if (cp <= 0xFFFF) {
        s[0] = (char)(0xE0 | (cp >> 12));
        s[1] = (char)(0x80 | ((cp >> 6) & 0x3F));
        s[2] = (char)(0x80 | (cp & 0x3F));
        return s + 3;
    }

    /* four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
    if (cp <= 0x10FFFF) {
        s[0] = (char)(0xF0 | (cp >> 18));
        s[1] = (char)(0x80 | ((cp >> 12) & 0x3F));
        s[2] = (char)(0x80 | ((cp >> 6) & 0x3F));
        s[3] = (char)(0x80 | (cp & 0x3F));
        return s + 4;
    }

    /* out of range: fixed three-byte fallback */
    s[0] = (char)0xEF;
    s[1] = (char)0xBF;
    s[2] = (char)0xBF;
    return s + 3;
}

/*
 * UCharStringToUTF8strlen
 *
 * Sum UTF8_CHAR_LENGTH() over the first str_len elements of src and
 * return the total.  Nothing is added for a terminator.
 *
 *   src      array of UChars to measure
 *   str_len  number of elements to examine; zero or negative gives 0
 */
int32_t
UCharStringToUTF8strlen(const UChar* src, int32_t str_len)
{
    int32_t total = 0;
    int idx = 0;

    while (idx < str_len) {
        total += UTF8_CHAR_LENGTH(src[idx]);
        idx++;
    }
    return total;
}

/*
 * UCharStringToUTF8String
 *
 * Convert str_len UChars from src to UTF-8 in dst and NUL-terminate dst.
 *
 * This conversion could be done with the ICU converter but
 * since it will be done frequently this UCS2 -> UTF8 converter
 * is directly implemented so we can avoid malloc'ing a converter.
 *
 *   src, str_len  input UChars and how many of them to convert
 *   dst, dst_len  output buffer and its total size in bytes; one byte
 *                 of it is reserved for the terminator
 *
 * Returns 0 if every UChar was converted, otherwise a pointer to the
 * first element of src that did not fit.  Conversion stops at that
 * element; dst still holds (terminated) everything converted before it.
 * If dst_len is not positive nothing at all is written to dst.
 */
const UChar *
UCharStringToUTF8String(const UChar* src, int32_t str_len, 
                        char* dst, int32_t dst_len)
{
    int32_t i;

    if (dst_len <= 0) {
        /* no room even for the terminator, so leave dst untouched */
        return (str_len > 0) ? src : 0;
    }

    dst_len -= 1; /* space for null terminator */
    for (i = 0; i < str_len; i++) {
        uint32_t val = (uint32_t)src[i];
        int32_t char_len = (int32_t)UTF8_CHAR_LENGTH(val);

        if (char_len > dst_len) {
            /* this one does not fit; report it to the caller below */
            break;
        }
        /* NOTE(review): relies on UTF8_APPEND_UCHAR advancing dst past
         * the bytes it writes (macro is defined outside this file) */
        UTF8_APPEND_UCHAR(dst, val);
        dst_len -= char_len;
    }
    *dst = '\0';
    return (i < str_len) ? src + i : 0;
}

#if 0
//UnicodeString *
//U8StringToUnicodeString(U8 *u8str, UChar **uchar_ptr)
//for memory efficiency
//uchar_ptr is malloc'd by this routine but caller must free uchar_ptr
//(only efficient way to get data into a UnicodeString object not to malloc)
//getting the UChar data into a UnicodeString is problematic:
// 1) one can get the UnicodeString to convert the utf8 data
//    but that involves creating (malloc'ing) a converter
// 2) one can get the UnicodeString to pre-allocate the correct size
//    but then must use an accessor function to write the data
// 3) one can use setTo (like here) but then is responsible for
//    freeing the memory
// 4) one could get the UnicodeString to pre-allocate the correct size
//    then use getBuffer to access the memory and behind the scenes
//    move the memory.
#endif

/*
 * UnicodeStringToU8String
 *
 * Return a freshly allocated, NUL-terminated UTF-8 copy of unistr.
 *
 *   unistr   string to convert
 *   ret_len  if non-null, receives the UTF-8 length in bytes, not
 *            counting the terminator
 *
 * The buffer comes from New(); the caller owns it.
 * NOTE(review): New() is presumably the Perl allocation macro, in which
 * case the buffer is released with Safefree() -- confirm.
 */
U8 *
UnicodeStringToU8String(UnicodeString &unistr, uint32_t *ret_len)
{
    const UChar *uchars, *uchars_left;
    int32_t str_len, u8_len;
    U8 * u8str;

    uchars = unistr.getBuffer();
    str_len = unistr.length();
    u8_len = UCharStringToUTF8strlen(uchars, str_len);

    /* single allocation: the encoded bytes plus one for the terminator */
    New(PICU_UTF8_MEM_COOKIE, u8str, u8_len+1, U8);
    uchars_left = UCharStringToUTF8String(uchars, str_len, 
                                          (char *)u8str, u8_len+1);
    if (uchars_left) {
        /* should not happen: the buffer was sized from the same input */
        printf("did not convert the whole string, %s %d\n", __FILE__, __LINE__);
    }
    if (ret_len) {
        // even though we null terminate, return the strlen not array len
        *ret_len = u8_len;
    }
    return(u8str);
}

/*
 * dump_CharString
 *
 * Debug helper: print a NUL-terminated byte string to stdout on one line.
 * Bytes below 0x7F are printed as characters, all other bytes as \xNN
 * (two lowercase hex digits).  The terminator is shown as \x00 and is
 * followed by a newline.
 *
 *   char_str  NUL-terminated string to print
 */
void
dump_CharString(const char *char_str)
{
    /* Inspect the bytes as unsigned: with a signed plain char, bytes
     * 0x80-0xFF are negative and would wrongly pass the "< 0x7F" test. */
    const unsigned char *p = (const unsigned char *)char_str;

    for (; *p; p++) {
        if (*p < 0x7F) {
            printf("%c", *p);
        }
        else {
            printf("\\x%02x", (unsigned)*p);
        }
    }
    printf("\\x%02x\n", (unsigned)*p);
}

/*
 * dump_CharStringAsHex
 *
 * Debug helper: print every byte of a NUL-terminated string to stdout
 * as \xNN (two lowercase hex digits), then the terminator in the same
 * form, then a newline.
 *
 *   char_str  NUL-terminated string to print
 */
void
dump_CharStringAsHex(const char *char_str)
{
    const char *cursor;

    for (cursor = char_str; *cursor != '\0'; ++cursor) {
        /* mask so a negative char does not print as ffffffNN */
        printf("\\x%02x", (*cursor)&0xFF);
    }
    printf("\\x%02x\n", *cursor);
}

void
dump_U8String(const U8 *u8_str)
{
    while (*u8_str) {
        if (*u8_str < 0x7F) {
            printf("%c", *u8_str);
        }
        else {
            printf("\\x%02x ", *u8_str);
        }
        u8_str++;
    }
    printf("\\x%02x\n", *u8_str);
}

void
dump_U8StringAsHex(const U8 *u8_str)
{
    while (*u8_str) {
        printf("%02x ", *u8_str);
        u8_str++;
    }
    printf("\\x%02x\n", *u8_str);
}

/*
 * dump_UCharString
 *
 * Debug helper: print str_len UChars to stdout on one line.  Values
 * below 0x7F are printed as characters; every other value is printed as
 * \xHHLL (high byte, low byte) followed by a space.  The element after
 * the last one printed is then shown in the same \xHHLL form, followed
 * by a newline.
 *
 *   uChar_str  UChars to print
 *   str_len    number of UChars to print
 *
 * NOTE(review): the closing printf reads uChar_str[str_len], one element
 * beyond the str_len elements given.  That is only valid when the buffer
 * carries an extra (terminator) element -- confirm against callers.
 */
void
dump_UCharString(const UChar *uChar_str, int32_t str_len)
{
    int pos = 0;

    while (pos < str_len) {
        if (uChar_str[pos] >= 0x7F) {
            printf("\\x%02x%02x ", (uChar_str[pos])>>8, (uChar_str[pos])&0xFF);
        }
        else {
            printf("%c", uChar_str[pos]);
        }
        pos++;
    }
    printf("\\x%02x%02x\n", (uChar_str[pos])>>8, (uChar_str[pos])&0xFF);
}

