
// character classification routines using inline and precompiled UNICODE.ORG
// database. idea is taken from UCDATA package of M. Leisher (C) 2010-2013
// Andrew Fedoniouk @ terrainformatica.com

#include "tool.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "../tl_hash_table.h"
#include "../tl_sync.h"
#include "ucdata_data.inl"
#include "ucdata_lt.h"

/*
 * Single-bit lookup table: masks32[k] has only bit k set (k = 0..31).
 * Used to test one property bit of a 32-bit property mask at a time.
 */
static unsigned long masks32[32] = {
    1UL << 0,  1UL << 1,  1UL << 2,  1UL << 3,
    1UL << 4,  1UL << 5,  1UL << 6,  1UL << 7,
    1UL << 8,  1UL << 9,  1UL << 10, 1UL << 11,
    1UL << 12, 1UL << 13, 1UL << 14, 1UL << 15,
    1UL << 16, 1UL << 17, 1UL << 18, 1UL << 19,
    1UL << 20, 1UL << 21, 1UL << 22, 1UL << 23,
    1UL << 24, 1UL << 25, 1UL << 26, 1UL << 27,
    1UL << 28, 1UL << 29, 1UL << 30, 1UL << 31};

/**************************************************************************
 *
 * Support for the character properties.
 *
 **************************************************************************/

// extern unsigned long  _ucprop_size;
// extern unsigned short *_ucprop_offsets;
// extern unsigned long  *_ucprop_ranges;

/*
 * Test whether `code` lies inside one of the ranges recorded for property
 * number `n`.
 *
 * _ucprop_offsets[n] is the index in _ucprop_ranges of the first
 * [start, end] pair belonging to property n; 0xffff marks a property with
 * no ranges.  The end of the property's ranges is given by the next offset
 * that is not 0xffff.
 *
 * Returns 1 when `code` is covered by a range of property n, 0 otherwise
 * (including when n is not a valid property index).
 */
static int _ucprop_lookup(unsigned long code, unsigned long n) {
  long l, r, m;

  /*
   * Reject property numbers outside the table: indexing past the sentinel
   * entry would read out of bounds.
   */
  if (n >= _ucprop_size)
    return 0;

  /*
   * There is an extra node on the end of the offsets to allow this routine
   * to work right.  If the index is 0xffff, then there are no nodes for the
   * property.
   */
  if ((l = _ucprop_offsets[n]) == 0xffff)
    return 0;

  /*
   * Locate the next offset that is not 0xffff.  The sentinel at the end of
   * the array is the max index value.
   */
  for (m = 1; n + m < _ucprop_size && _ucprop_offsets[n + m] == 0xffff; m++)
    ;

  r = _ucprop_offsets[n + m] - 1;

  /*
   * Binary search over [start, end] pairs stored at even/odd indices.
   */
  while (l <= r) {
    /*
     * Determine a "mid" point and adjust to make sure the mid point is at
     * the beginning of a range pair.
     */
    m = (l + r) >> 1;
    m -= (m & 1);
    if (code > _ucprop_ranges[m + 1])
      l = m + 2; /* code is above this pair: continue in the upper half */
    else if (code < _ucprop_ranges[m])
      r = m - 2; /* code is below this pair: continue in the lower half */
    else
      return 1;  /* start <= code <= end */
  }
  return 0;
}

/*
 * Compute which of the requested properties apply to `code`.
 *
 * mask1 selects properties 0..31 and mask2 selects properties 32 and up
 * (bit k of mask2 corresponds to property 32 + k).  For every selected
 * property that _ucprop_lookup() reports for `code`, the matching bit is
 * set in *mask1_out / *mask2_out.
 *
 * If both masks are zero the function returns 0 without touching the
 * output pointers.  A half whose output pointer is NULL is skipped
 * entirely and does not contribute to the return value.
 *
 * Returns 1 if at least one selected property matched, 0 otherwise.
 */
static int _ucprops(unsigned long code, unsigned long mask1,
                    unsigned long mask2, unsigned long *mask1_out,
                    unsigned long *mask2_out) {
  int           ret = 0;
  unsigned long i;

  if (mask1 == 0 && mask2 == 0)
    return ret;

  if (mask1_out) {
    *mask1_out = 0;
    for (i = 0; mask1 && i < 32; i++) {
      if ((mask1 & masks32[i]) && _ucprop_lookup(code, i)) {
        /*
         * Use the unsigned table entry rather than `1 << i`: shifting a
         * signed int by 31 overflows, and the negative result sign-extends
         * into the upper half of a 64-bit unsigned long.
         */
        *mask1_out |= masks32[i];
        ret = 1;
      }
    }
  }

  if (mask2_out) {
    *mask2_out = 0;
    for (i = 32; mask2 && i < _ucprop_size; i++) {
      const unsigned long bit = masks32[i & 31];
      if ((mask2 & bit) && _ucprop_lookup(code, i)) {
        /*
         * Set the same bit that was tested.  For i in 32..63 this equals
         * bit (i - 32); it also avoids an out-of-range shift should the
         * property table ever hold more than 64 entries.
         */
        *mask2_out |= bit;
        ret = 1;
      }
    }
  }
  return ret;
}

/*static int _ucisprop(unsigned long code, unsigned long mask1, unsigned long
mask2)
{
    unsigned long i;

    if (mask1 == 0 && mask2 == 0)
      return 0;

    for (i = 0; mask1 && i < 32; i++) {
        if ((mask1 & masks32[i]) && _ucprop_lookup(code, i))
          return 1;
    }

    for (i = 32; mask2 && i < _ucprop_size; i++) {
        if ((mask2 & masks32[i & 31]) && _ucprop_lookup(code, i))
          return 1;
    }

    return 0;
}*/

// Cached property bits of a single code point, stored as the value type of
// the per-character hash table in ucprops().
struct char_table_rec {
  unsigned long mask1; // property bits filled through _ucprops()' mask1_out
  unsigned long mask2; // property bits filled through _ucprops()' mask2_out
  char_table_rec() : mask1(0), mask2(0) {} // starts with no bits set
};

int ucprops(unsigned long code, unsigned long mask1, unsigned long mask2,
            unsigned long *mask1_out, unsigned long *mask2_out) {
  // return _ucprops(code, mask1, mask2, mask1_out, mask2_out);

  if (mask1 == 0 && mask2 == 0)
    return 0;

  static tool::mutex char_table_guard;

  tool::critical_section _(char_table_guard);

  static tool::hash_table<uint, char_table_rec> char_table(8037);

  bool            created = false;
  char_table_rec &masks   = char_table.get_ref(uint(code), created);

  if (created)
    _ucprops(code, 0xFFFFFFFF, 0xFFFFFFFF, &masks.mask1, &masks.mask2);

  unsigned long v1 = masks.mask1 & mask1;
  unsigned long v2 = masks.mask2 & mask2;

  if (mask1_out)
    *mask1_out = v1;
  if (mask2_out)
    *mask2_out = v2;

  return v1 || v2;
}

// Predicate form of ucprops(): returns non-zero if `code` has at least one
// of the properties selected by mask1 / mask2.  The matching bits themselves
// are not reported (both output pointers are passed as null).
int ucisprop(unsigned long code, unsigned long mask1, unsigned long mask2) {
  unsigned long *const no_output = 0;
  return ucprops(code, mask1, mask2, no_output, no_output);
}

/**************************************************************************
 *
 * Support for case mapping.
 *
 **************************************************************************/

/*
 * Binary search of _uccase_map, which is organised as triples whose first
 * element is the key code point.
 *
 * l and r are the indices of the first and last triple of the section to
 * search; `field` (1 or 2 in the callers) selects which of the two
 * remaining elements of the matching triple is returned.
 *
 * Returns the selected mapping, or `code` itself when no triple has `code`
 * as its key.
 */
static unsigned long _uccase_lookup(unsigned long code, long l, long r,
                                    int field) {
  long lo = l;
  long hi = r;

  while (lo <= hi) {
    /*
     * Take the midpoint and round it down to the first element of the
     * triple it falls into.
     */
    long mid = (lo + hi) >> 1;
    mid -= (mid % 3);

    if (code > _uccase_map[mid]) {
      lo = mid + 3; /* key too small: skip to the next triple */
    } else if (code < _uccase_map[mid]) {
      hi = mid - 3; /* key too large: back up one triple */
    } else {
      return _uccase_map[mid + field];
    }
  }

  /* No mapping recorded: the character maps to itself. */
  return code;
}

/*
 * Map `code` to upper case.
 *
 * Lower-case characters are looked up in the section of _uccase_map that
 * starts at _uccase_len[0] and is _uccase_len[1] elements long (field 1);
 * title-case characters in the section that follows it (field 2).  Any
 * other character is returned unchanged.
 *
 * NOTE(review): the title-case branch reads field 2, the same field that
 * uctolower() reads for title-case input, so both functions return the same
 * value for a title-case character.  Confirm against the layout of the
 * title-case triples in the generated table.
 */
unsigned long uctoupper(unsigned long code) {
  if (ucislower(code)) {
    /* Lower-case section of the table. */
    const long first = _uccase_len[0];
    const long last  = (first + _uccase_len[1]) - 3;
    return _uccase_lookup(code, first, last, 1);
  }

  if (ucistitle(code)) {
    /* Title-case section: everything after the upper and lower sections. */
    const long first = _uccase_len[0] + _uccase_len[1];
    const long last  = _uccase_size - 3;
    return _uccase_lookup(code, first, last, 2);
  }

  /* Neither lower nor title case: nothing to convert. */
  return code;
}

/*
 * Map `code` to lower case.
 *
 * Upper-case characters are looked up in the first section of _uccase_map
 * (indices 0 .. _uccase_len[0] - 3, field 1); title-case characters in the
 * section that starts at _uccase_len[0] + _uccase_len[1] (field 2).  Any
 * other character is returned unchanged.
 *
 * NOTE(review): uctoupper() also reads field 2 for title-case input, so
 * both functions return the same value for a title-case character.  Confirm
 * against the layout of the title-case triples in the generated table.
 */
unsigned long uctolower(unsigned long code) {
  if (ucisupper(code)) {
    /* Upper-case section sits at the very start of the table. */
    const long last = _uccase_len[0] - 3;
    return _uccase_lookup(code, 0, last, 1);
  }

  if (ucistitle(code)) {
    /* Title-case section: everything after the upper and lower sections. */
    const long first = _uccase_len[0] + _uccase_len[1];
    const long last  = _uccase_size - 3;
    return _uccase_lookup(code, first, last, 2);
  }

  /* Neither upper nor title case: nothing to convert. */
  return code;
}

/*
 * Map `code` to title case.
 *
 * Both the upper-case and the lower-case sections of _uccase_map are read
 * with field 2.  Characters that are neither upper nor lower case
 * (including ones that are already title case) are returned unchanged.
 */
unsigned long uctotitle(unsigned long code) {
  const int title_field = 2;

  if (ucisupper(code)) {
    /* Upper-case section sits at the very start of the table. */
    const long last = _uccase_len[0] - 3;
    return _uccase_lookup(code, 0, last, title_field);
  }

  if (ucislower(code)) {
    /* Lower-case section follows the upper-case one. */
    const long first = _uccase_len[0];
    const long last  = (first + _uccase_len[1]) - 3;
    return _uccase_lookup(code, first, last, title_field);
  }

  return code;
}

/*enum WRITNG_SCRIPT
{
  WS_UNKNOWN   = 0,     // - Latin and the rest
  WS_CYRILLIC  = 1,     // - Cyrillic
  WSI_HANZI    = 2,     // - Chinese  (hanzi)
  WS_KANA     = 3,     // - Japanese (Kana)
  WSI_ARABIC   = 4,     // - Arabic
  WSI_HEBREW   = 5,     // - Hebrew
  WS_HANGUL   = 6,     // - Korean
};*/

// One row of the writing-script table: every code point in [start, end]
// is classified as script `ws`.  Field order matters - range_defs is
// brace-initialized positionally.
struct range_def {
  WRITING_SCRIPT ws;    // script reported for the range
  unsigned long  start; // first code point of the range
  unsigned long  end; // inclusive
};

// Code point ranges and the writing script assigned to each.
// writing_script() binary-searches this table, so rows must stay sorted by
// `start` and must not overlap.  Code points outside every row are reported
// as WS_UNKNOWN by writing_script().
range_def range_defs[] = {

    {WS_CYRILLIC, 0x0400, 0x04FF},
    {WS_HEBREW, 0x0590, 0x05FF},
    {WS_ARABIC, 0x0600, 0x06FF},
    {WS_HANGUL, 0x1100, 0x11FF}, // Hangul Jamo
    {WS_EMOTICON, 0x25FB, 0x25FE }, // medium / medium-small squares
    {WS_EMOTICON, 0x2600, 0x27EF }, // miscellaneous symbols, dingbats
    {WS_EMOTICON, 0x2B00, 0x2BFF }, // miscellaneous symbols and arrows
    {WS_KANA, 0x3040, 0x309F}, // hiragana
    {WS_KANA, 0x30A0, 0x30FF}, // katakana
    {WS_HANGUL, 0x3130, 0x318F}, // Hangul compatibility Jamo
    {WS_KANA, 0x31F0, 0x31FF}, // katakana phonetic extensions
    {WS_HANGUL, 0x3200, 0x32FF}, // enclosed CJK letters and months
    {WS_HANZI, 0x3400, 0x4DFF}, // CJK extension A (and the block after it)
    {WS_HANZI, 0x4E00, 0x9FFF}, // CJK unified ideographs
    {WS_HANGUL, 0xA960, 0xA97F}, // Hangul Jamo extended-A
    {WS_HANGUL, 0xAC00, 0xD7AF}, // Hangul syllables
    {WS_HANGUL, 0xD7B0, 0xD7FF}, // Hangul Jamo extended-B
    {WS_HANZI, 0xF900, 0xFAFF}, // CJK compatibility ideographs
    {WS_HANZI, 0xFF00, 0xFF64}, // fullwidth forms
    {WS_KANA, 0xFF65, 0xFF9F}, // halfwidth katakana
    {WS_HANGUL, 0xFFA0, 0xFFDC}, // halfwidth Hangul
    {WS_HANZI, 0xFFE0, 0xFFEE}, // fullwidth signs, halfwidth symbols
    {WS_KANA, 0x1B000, 0x1B001}, // katakana archaic
    {WS_EMOTICON, 0x1F000, 0x1FFFF }, // plane-1 pictographs (emoji etc.)
    {WS_HANZI, 0x20000, 0x2A6DF}, // CJK extension B
    {WS_HANZI, 0x2F800, 0x2FA1F}}; // CJK compatibility ideographs supplement


// Classify a UCS-4 code point by writing script.
//
// Binary-searches range_defs (which therefore has to be sorted by range and
// free of overlaps) for the row whose inclusive [start, end] interval
// contains `ucs4code`.  Returns that row's script, or WS_UNKNOWN when no row
// covers the code point.
WRITING_SCRIPT writing_script(unsigned long ucs4code) {
  int lo = 0;
  int hi = items_in(range_defs) - 1;

  // Invariant: a matching row, if any, has index in [lo, hi].
  while (lo <= hi) {
    const int        mid = (lo + hi) / 2;
    const range_def &row = range_defs[mid];

    if (ucs4code > row.end) {
      lo = mid + 1; // code point lies above this row
    } else if (ucs4code < row.start) {
      hi = mid - 1; // code point lies below this row
    } else {
      return row.ws;
    }
  }

  return WS_UNKNOWN;
}
