//
//|
//| Copyright (c) 2001-2005
//| Andrew Fedoniouk - andrew@terrainformatica.com
//|
//|
//|
//|

#ifndef __tl_ustring_h
#define __tl_ustring_h
//|
//|
//| Copyright (c) 2001-2005
//| Andrew Fedoniouk - andrew@terrainformatica.com
//|
//| UTF16 COW string
//|
//|

#include "tl_array.h"
#include "tl_basic.h"
#include "tl_generator.h"
#include "tl_slice.h"
#include "tl_streams.h"
#include "tl_string_t.h"
#include "tl_string.h"
#include "ucdata/ucdata_lt.h"

namespace tool {

  // Wide-character string type of this header (the file header describes it as
  // a UTF-16 copy-on-write string): string_t instantiated with <wchar, char>.
  typedef tool::string_t<wchar, char> ustring;
  // Companion string_chars_t instantiation using the same <wchar, char> parameters.
  typedef tool::string_chars_t<wchar, char> ustring_chars;

  // wchar specialization of to_lower: rewrites every element of the slice in
  // place with the value uctolower() returns for it.
  template<> inline void to_lower(tslice<wchar> wc) {
    wchar *cur = wc.start;
    const wchar *const limit = wc.end();
    while (cur < limit) {
      *cur = static_cast<wchar>(uctolower(*cur));
      ++cur;
    }
  }

  // wchar specialization of to_upper: rewrites every element of the slice in
  // place with the value uctoupper() returns for it.
  template<> inline void to_upper(tslice<wchar> wc) {
    wchar *cur = wc.start;
    const wchar *const limit = wc.end();
    while (cur < limit) {
      *cur = static_cast<wchar>(uctoupper(*cur));
      ++cur;
    }
  }
 
  // Byte-order-mark byte sequences; the definitions live in another
  // translation unit. UTF8_BOM (3 bytes) is what u8::from_utf16() prepends
  // when its prepend_bom argument is true.
  extern byte UTF8_BOM[3];
  // 2-byte mark, presumably for UTF-16 little-endian data (going by the name).
  extern byte UTF16LE_BOM[2];

  // Returns a ustring built from the output of xml_escape_seq(in).
  // NOTE(review): which characters get escaped is decided by xml_escape_seq,
  // which is declared elsewhere.
  inline ustring xml_escape(wchars in) { return ustring(xml_escape_seq(in)()); }

  namespace u16 
  {
    // Reads one code point from the front of buf (implemented out of line).
    // Callers in this file rely on buf being consumed by the call
    // (codepoints() loops until buf.length is 0) and on 0 being returned when
    // there is nothing left to read (cvt() stops on 0).
    uint getc(wchars &buf);
    // Stores code point uc as UTF-16 into W2, which must have room for two
    // code units. html_unescape() uses the return value as the number of
    // code units written.
    uint putc(uint uc, wchar *W2 /*W[2]*/);

    // true when c lies in 0xD800..0xDBFF, i.e. it is the first (leading) code
    // unit of a surrogate pair in a UTF-16 sequence
    inline bool is_suro_head(wchar c) {
      return !(c < 0xD800 || c > 0xDBFF);
    }
    // true when c lies in 0xDC00..0xDFFF, i.e. it is the second (trailing)
    // code unit of a surrogate pair in a UTF-16 sequence
    inline bool is_suro_tail(wchar c) {
      return !(c < 0xDC00 || c > 0xDFFF);
    }
    // Advances pos within buf with respect to surrogate pairs.
    // NOTE(review): n is presumably the number of positions to move and the
    // result presumably reports whether the move was possible - confirm
    // against the out-of-line definition.
    bool advance(const wchars &buf, int n, int &pos);

    // Counts how many u16::getc() calls it takes to drain utf16, i.e. the
    // number of code points as getc() delimits them.
    inline uint codepoints(wchars utf16) {
      uint total = 0;
      for (; utf16.length; ++total)
        u16::getc(utf16);
      return total;
    }

    // Decodes `in` with getc() and appends each code point to `out`.
    // Decoding stops at the first 0 returned by getc().
    inline void cvt(wchars in, array<uint> &out) {
      for (;;) {
        uint cp = getc(in);
        if (!cp)
          break;
        out.push(cp);
      }
    }

    // Code-page based conversions; the bytes and wchars overloads are
    // implemented out of line.
    ustring cvt(uint codepage, bytes text);
    // chars flavour: views the characters as raw bytes and forwards to the
    // bytes overload above.
    inline ustring cvt(uint codepage, chars text) {
      const byte *raw = (const byte *)text.start;
      return cvt(codepage, bytes(raw, text.length));
    }
    string cvt(uint codepage, wchars text);

    // Generator over the code points of a UTF-16 sequence. It is built with
    // the $generator/$emit/$yield/$stop macros (presumably provided by
    // tl_generator.h). It keeps its own copy of the slice in `seq` and, while
    // seq still has content, yields the value u16::getc(seq) returns.
    $generator(each_codepoint) {
      wchars seq;
      each_codepoint(wchars chars) : seq(chars) {}

      $emit(uint) // will emit codepoints
          while (seq.length) $yield(getc(seq));
      $stop; // stop, end of sequence. end of the generator body.
    };
  } // namespace u16

  namespace u8 {

    // Appends the UTF-8 encoding of code point c to utf8out:
    // 1 byte below 0x80, 2 below 0x800, 3 below 0x10000, 4 below 0x200000.
    // Values of 0x200000 and above append nothing.
    inline void putc(uint c, array<byte> &utf8out) {
      uint lead; // value of the first byte of the sequence
      uint tail; // number of continuation bytes that follow it
      if (c < 0x80) {
        lead = c;
        tail = 0;
      }
      else if (c < 0x800) {
        lead = (c >> 6) | 0xC0;
        tail = 1;
      }
      else if (c < 0x10000) {
        lead = (c >> 12) | 0xE0;
        tail = 2;
      }
      else if (c < 0x200000) {
        lead = (c >> 18) | 0xF0;
        tail = 3;
      }
      else
        return;

      utf8out.push(byte(lead));
      // continuation bytes carry 6 payload bits each, most significant first
      while (tail > 0) {
        --tail;
        utf8out.push(byte(((c >> (6 * tail)) & 0x3F) | 0x80));
      }
    }

    // Writes the UTF-8 encoding of code point c to the raw buffer utf8out
    // (1..4 bytes, nothing for c >= 0x200000). num_bytes is incremented once
    // per byte written; it is not reset here, so the caller supplies the
    // starting value. The caller must provide enough room in utf8out.
    inline void putc(uint c, byte *utf8out, uint &num_bytes) {
      uint lead; // value of the first byte of the sequence
      uint tail; // number of continuation bytes that follow it
      if (c < 0x80) {
        lead = c;
        tail = 0;
      }
      else if (c < 0x800) {
        lead = (c >> 6) | 0xC0;
        tail = 1;
      }
      else if (c < 0x10000) {
        lead = (c >> 12) | 0xE0;
        tail = 2;
      }
      else if (c < 0x200000) {
        lead = (c >> 18) | 0xF0;
        tail = 3;
      }
      else
        return;

      *utf8out++ = byte(lead);
      ++num_bytes;
      // continuation bytes carry 6 payload bits each, most significant first
      while (tail > 0) {
        --tail;
        *utf8out++ = byte(((c >> (6 * tail)) & 0x3F) | 0x80);
        ++num_bytes;
      }
    }

    // Writes the UTF-8 encoding of code point c to the stream, one write()
    // call per byte. Returns false as soon as a write() reports failure,
    // true otherwise (also for c >= 0x200000, where nothing is written).
    inline bool putc(uint c, stream *utf8out) {
      uint lead; // value of the first byte of the sequence
      uint tail; // number of continuation bytes that follow it
      if (c < 0x80) {
        lead = c;
        tail = 0;
      }
      else if (c < 0x800) {
        lead = (c >> 6) | 0xC0;
        tail = 1;
      }
      else if (c < 0x10000) {
        lead = (c >> 12) | 0xE0;
        tail = 2;
      }
      else if (c < 0x200000) {
        lead = (c >> 18) | 0xF0;
        tail = 3;
      }
      else
        return true;

      if (!utf8out->write(lead))
        return false;
      // continuation bytes carry 6 payload bits each, most significant first
      while (tail > 0) {
        --tail;
        if (!utf8out->write(((c >> (6 * tail)) & 0x3F) | 0x80))
          return false;
      }
      return true;
    }

    // UTF-8 readers, implemented out of line.
    // NOTE(review): judging by the signatures only - the first takes buf by
    // non-const reference (so it can consume what it reads) and returns a
    // uint; the second reads at index pos of a const buffer and returns a
    // single wchar; the third reads from a stream. Confirm details against
    // the definitions.
    uint  getc(bytes &buf);
    wchar getc(const bytes &buf, int &pos);
    int   getc(stream *f);

    // Appends the UTF-8 encoding of the UTF-16 text `utf16` to utf8out,
    // preceded by the 3-byte UTF8_BOM when prepend_bom is true.
    //
    // A leading surrogate (0xD800..0xDBFF) immediately followed by a trailing
    // surrogate (0xDC00..0xDFFF) is combined into one supplementary code point
    // and written as a single 4-byte sequence. Encoding the two halves
    // separately (3 bytes each) does not produce valid UTF-8.
    // Unpaired surrogates are handed to putc() unchanged.
    inline void from_utf16(wchars utf16, array<byte> &utf8out, bool prepend_bom = false) {
      if (prepend_bom)
        utf8out.push(UTF8_BOM,3);
      const wchar *pc = utf16.start;
      const wchar *pc_end = utf16.end();
      for (; pc < pc_end; ++pc) {
        unsigned int c = *pc;
        if (c >= 0xD800 && c <= 0xDBFF && pc + 1 < pc_end &&
            pc[1] >= 0xDC00 && pc[1] <= 0xDFFF) {
          // well-formed pair: 10 payload bits from each half, offset by 0x10000
          c = 0x10000 + ((c - 0xD800) << 10) + (pc[1] - 0xDC00);
          ++pc; // the trailing half has been consumed as well
        }
        putc(c, utf8out);
      }
    }

    // Pointer + length convenience overload: wraps the range in a wchars slice
    // and forwards to the slice based from_utf16().
    inline void from_utf16(const wchar *utf16, size_t utf16_length, array<byte> &utf8out, bool prepend_bom = false) {
      from_utf16(wchars(utf16, utf16_length), utf8out, prepend_bom);
    }

    // UTF-8 -> UTF-16 conversion into buf; implemented out of line.
    // NOTE(review): fail_on_fail presumably selects whether malformed input
    // makes the call fail - confirm against the definition.
    bool to_utf16(bytes src, array<wchar> &buf, bool fail_on_fail);

    // chars flavour: views the characters as raw bytes and forwards to the
    // bytes overload above.
    inline bool to_utf16(chars src, array<wchar> &buf, bool fail_on_fail) {
      const byte *raw = (const byte *)src.start;
      return to_utf16(bytes(raw, src.length), buf, fail_on_fail);
    }

    // UTF-16 slice -> string holding the bytes from_utf16() produces for it
    // (starting with the BOM when prepend_bom is true).
    inline string cvt(wchars text, bool prepend_bom = false) {
      array<byte> encoded;
      from_utf16(text, encoded, prepend_bom);
      return string(encoded());
    }

    // ustring -> string holding the bytes from_utf16() produces for its
    // content (starting with the BOM when prepend_bom is true).
    inline string cvt(const ustring& text, bool prepend_bom = false) {
      array<byte> encoded;
      from_utf16(text(), encoded, prepend_bom);
      return string(encoded());
    }

    // UTF-8 bytes -> ustring. to_utf16() is called with fail_on_fail = false
    // and its result is ignored; whatever it placed in the buffer is returned.
    inline ustring cvt(bytes text) {
      array<wchar> decoded;
      to_utf16(text, decoded, false);
      return ustring(decoded());
    }

    // UTF-8 chars -> ustring. The characters are viewed as bytes via
    // to_bytes(); to_utf16() is called with fail_on_fail = false and its
    // result is ignored.
    inline ustring cvt(chars text) {
      array<wchar> decoded;
      to_utf16(to_bytes(text), decoded, false);
      return ustring(decoded());
    }

    // string -> ustring: forwards the slice obtained with text() to one of
    // the cvt() overloads declared above.
    // NOTE(review): presumably this resolves to the chars overload - depends
    // on the return type of string::operator(), which is declared elsewhere.
    inline ustring cvt(const string& text) {
      return cvt(text());
    }
    
    // Appends `utf16` to utf8out as UTF-8, replacing the five markup
    // characters < > & " ' with &lt; &gt; &amp; &quot; &apos;.
    //
    // A leading surrogate immediately followed by a trailing surrogate is
    // combined into one supplementary code point and written as a single
    // 4-byte sequence. Encoding the two halves separately does not produce
    // valid UTF-8. Unpaired surrogates are handed to putc() unchanged.
    inline void x_from(wchars utf16, array<byte> &utf8out) {
      const wchar *pc = utf16.cbegin();
      const wchar *pc_end = utf16.cend();
      for (; pc < pc_end; ++pc) {
        switch (*pc) {
        case '<':
          utf8out.push((const byte *)"&lt;", 4);
          continue;
        case '>':
          utf8out.push((const byte *)"&gt;", 4);
          continue;
        case '&':
          utf8out.push((const byte *)"&amp;", 5);
          continue;
        case '"':
          utf8out.push((const byte *)"&quot;", 6);
          continue;
        case '\'':
          utf8out.push((const byte *)"&apos;", 6);
          continue;
        }
        unsigned int c = *pc;
        if (c >= 0xD800 && c <= 0xDBFF && pc + 1 < pc_end &&
            pc[1] >= 0xDC00 && pc[1] <= 0xDFFF) {
          // well-formed pair: 10 payload bits from each half, offset by 0x10000
          c = 0x10000 + ((c - 0xD800) << 10) + (pc[1] - 0xDC00);
          ++pc; // the trailing half has been consumed as well
        }
        putc(c, utf8out);
      }
    }

    // Same as x_from() except that '&' is NOT escaped: appends `utf16` to
    // utf8out as UTF-8, replacing < > " ' with &lt; &gt; &quot; &apos;.
    //
    // Each element is read only after the bounds check, so neither an empty
    // slice nor the one-past-the-end position is ever dereferenced.
    // A leading surrogate immediately followed by a trailing surrogate is
    // combined into one supplementary code point and written as a single
    // 4-byte sequence; unpaired surrogates are handed to putc() unchanged.
    inline void x_from_no_amp(wchars utf16, array<byte> &utf8out) {
      const wchar *pc = utf16.cbegin();
      const wchar *pc_end = utf16.cend();
      for (; pc < pc_end; ++pc) {
        switch (*pc) {
        case '<':
          utf8out.push((const byte *)"&lt;", 4);
          continue;
        case '>':
          utf8out.push((const byte *)"&gt;", 4);
          continue;
          // '&' is intentionally passed through unescaped
        case '"':
          utf8out.push((const byte *)"&quot;", 6);
          continue;
        case '\'':
          utf8out.push((const byte *)"&apos;", 6);
          continue;
        }
        unsigned int c = *pc;
        if (c >= 0xD800 && c <= 0xDBFF && pc + 1 < pc_end &&
            pc[1] >= 0xDC00 && pc[1] <= 0xDFFF) {
          // well-formed pair: 10 payload bits from each half, offset by 0x10000
          c = 0x10000 + ((c - 0xD800) << 10) + (pc[1] - 0xDC00);
          ++pc; // the trailing half has been consumed as well
        }
        putc(c, utf8out);
      }
    }
  }

  // Maps an entity name to a code point; implemented out of line.
  // NOTE(review): the value returned for an unknown name is not visible
  // from this header.
  ucode html_unescape(chars name);
  // Wide-character flavour: builds a `string` from name and passes it on to
  // html_unescape() (presumably the chars overload above).
  inline ucode html_unescape(wchars name) { return html_unescape(string(name)); }

// Copies src into buf, replacing each "&name;" span by the UTF-16 form of the
// code point html_unescape(name) returns. The function returns a copy of buf.
// NOTE(review): relies on slice::chop(d) handing back the part in front of the
// delimiter d and leaving src positioned after it - confirm in tl_slice.h.
template<typename CT>
inline array<wchar> html_unescape(slice<CT> src,array<wchar>& buf)
{
  for (;;) {
    if (!src.length)
      break;

    // literal run in front of the next '&'
    slice<CT> literal = src.chop('&');
    buf.push(literal);
    if (!src.length)
      break;

    // entity name up to ';', translated and stored as one or two code units
    slice<CT> entity = src.chop(';');
    ucode code = html_unescape(entity);
    wchar units[2];
    buf.push(slice<wchar>(units, u16::putc(code, units)));
  }
  return buf;
}

// Character classes reported by wchar_class(); the notes give the code units
// that wchar_class() maps to each value.
enum WCHAR_CLASS {
  wcc_space,               // ' ' and ASCII control characters other than CR and LF
  wcc_alpha,               // everything not claimed by another class
  wcc_number,              // '0'..'9' and fullwidth digits 0xFF10..0xFF19
  wcc_paren,               // " ( ) [ ] { }
  wcc_punct,               // ! # $ % & ' * + , - . / : ; < = > ? \ ^ |
  wcc_break,               // '\r'
  wcc_newline,             // '\n'
  wcc_soft_hyphen,         // SOFT_HYPHEN
  wcc_ideograph,           // default for code units >= 0x3000
  wcc_rtl_alpha,           // 0x0590..0x07FF (Hebrew, Arabic, ...)
  wcc_non_printable_space, // 0x2004..0x2007, 0x2009, 0x200B
  wcc_forced_space,        // NBSP_CHAR
  wcc_ltr_mark,            // 0x200E
  wcc_rtl_mark,            // 0x200F
  wcc_hangul,              // 0x1100..0x11FF and 0xAC00..0xD7AF
  wcc_surrogate_pair,      // 0xD800..0xDFFF
};

// Classifies a single UTF-16 code unit into one of the WCHAR_CLASS values.
// Anything not recognised by one of the rules below is reported as wcc_alpha
// (or wcc_ideograph for code units at or above 0x3000).
inline WCHAR_CLASS wchar_class(wchar c) {
  // ---- ASCII range ----
  if (c < 128) {
    if (c >= '0' && c <= '9')
      return wcc_number;

    switch (c) {
    case '\n': return wcc_newline;
    case '\r': return wcc_break;

    // quotes and brackets (guillemets 0xAB / 0xBB and 0x8B are not listed)
    case '\"': case '(': case ')': case '[': case ']': case '{': case '}':
      return wcc_paren;

    case '!': case '#': case '$': case '%': case '&': case '\'': case '*':
    case '+': case ',': case '-': case '.': case '/': case ':': case ';':
    case '<': case '=': case '>': case '?': case '\\': case '^': case '|':
      return wcc_punct;
    }

    // the blank and the remaining control characters are spacing,
    // every other ASCII character counts as a letter
    return c <= ' ' ? wcc_space : wcc_alpha;
  }

  // ---- Hangul Jamo block ----
  if (c >= 0x1100 && c <= 0x11FF)
    return wcc_hangul;

  // ---- 0x3000 and above: ideographic unless one of the ranges below ----
  if (c >= 0x3000) {
    if (c >= 0xAC00 && c <= 0xD7AF)
      return wcc_hangul;
    if (c >= 0xD800 && c <= 0xDFFF)
      return wcc_surrogate_pair; // half of a surrogate pair
    if (c >= 0xFF10 && c <= 0xFF19)
      return wcc_number; // FULLWIDTH DIGIT ZERO .. FULLWIDTH DIGIT NINE
    if (c >= 0xFF21 && c <= 0xFF3A)
      return wcc_alpha; // FULLWIDTH LATIN CAPITAL LETTER A .. Z
    return wcc_ideograph;
  }

  // ---- special single characters ----
  if (c == SOFT_HYPHEN)
    return wcc_soft_hyphen;

  if (c == NBSP_CHAR)
    return wcc_forced_space;

  switch (c) {
  case 0x2004: case 0x2005: case 0x2006: case 0x2007: case 0x2009: case 0x200b:
    return wcc_non_printable_space;
  case 0x200e: // left-to-right mark
    return wcc_ltr_mark;
  case 0x200f: // right-to-left mark
    return wcc_rtl_mark;
  }

  // Hebrew (0x0590..0x05FF) followed by Arabic, Arabic Sup., N'ko, Syriac,
  // Thaana (0x0600..0x07FF) - one contiguous range
  const bool rtl_script = c >= 0x0590 && c <= 0x07FF;
  return rtl_script ? wcc_rtl_alpha : wcc_alpha;
}

// wchar specialization of capitalize: upper-cases, in place, the first
// character of the slice and every character that follows a delimiter
// (space, paren, punctuation, CR or LF class). Other characters are left
// untouched.
template<> inline void capitalize(tslice<wchar> wc) {
  bool   at_word_start = true;
  wchar *cur = wc.start;
  for (const wchar *limit = wc.end(); cur < limit; ++cur) {
    switch (wchar_class(*cur)) {
    case wcc_space:
    case wcc_paren:
    case wcc_punct:
    case wcc_break:
    case wcc_newline:
      // delimiter: whatever comes next starts a new word
      at_word_start = true;
      continue;
    default:
      break;
    }
    if (!at_word_start)
      continue;
    to_upper(tslice<wchar>(cur, 1));
    at_word_start = false;
  }
}

// Replaces the content of this wide string with the conversion of the narrow
// slice s.
//  - Without UTF8_CHARS: s is converted with the Win32 MultiByteToWideChar()
//    API in two passes (measure, then convert).
//  - With UTF8_CHARS: s is decoded with u8::to_utf16() (fail_on_fail = false).
template<>
inline void string_t<wchar, char>::set(slice<char> s) {
#ifndef UTF8_CHARS
  // Both passes must use the SAME code page: the buffer is sized by the first
  // call, so converting with a different code page could need a different
  // number of wide characters than were allocated.
  const unsigned int code_page = CP_THREAD_ACP;
  int uslen = MultiByteToWideChar(code_page, 0, s.start, s.size(), 0, 0);
  set_length(uslen);
  // nothing to fill in when the string ended up with the shared empty data
  if (_data != null_data())
    MultiByteToWideChar(code_page, 0, s.start, s.size(), head(), uslen);
#else
  array<wchar> buf;
  u8::to_utf16(s, buf, false);
  set_length(buf.size());
  target().copy(buf.cbegin(), buf.length());
#endif
}

// printf-style formatting into a wide string: runs do_w_vsprintf_os() over
// fmt/args with an output stream that collects every emitted character, then
// returns the collected characters.
template<>
inline string_t<wchar, char> string_t<wchar, char>::format_args(const wchar *fmt, va_list args) {

  // output sink: stores each character handed to out() and never fails
  struct wchar_sink : public printf_output_stream {
    tool::array<wchar> collected;
    virtual bool      out(int c) {
      collected.push((wchar)c);
      return true;
    }
  };

  wchar_sink sink;
  do_w_vsprintf_os(&sink, fmt, args);
  return sink.collected();
}

// pool traits
struct ustring_ignore_case {
  static unsigned int hash(const ustring &e) {
    unsigned int h  = 0, g;
    const wchar *pc = (const wchar *)e.cbegin();
    while (*pc) {
      h = (h << 4) + to_lower(*pc++);
      if ((g = h & 0xF0000000) != 0)
        h ^= g >> 24;
      h &= ~g;
    }
    return h;
  }
  static bool equal(const ustring &l, const ustring &r) {
    return lexical::ci::eq(l(),r());
  }
  static ustring create(const ustring &key) { return key; }
};

inline bool stoi(const wchar *s, int &i) {
  // if( !s || !s[0] )
  //  return false;

  wchars str = chars_of(s);

  return parse_int(str, i);

  // if(!parse_int(str,i))
  //  return false;

  // wchar* end = 0;

  // int n = (int)wcstol(s,&end,0);
  // if( end && (*end == 0 || *end == '%'))
  //{
  //  i = n;
  //  return true;
  //}
  // return false;
}

// Convenience form of stoi(): returns the parsed value directly. The success
// flag is ignored; the result starts out as 0.
inline int stoi(const wchar *s) {
  int value = 0;
  stoi(s, value);
  return value;
}

inline bool stof(const wchar *s, double &d) {
  wchars str = chars_of(s);
  return parse_real(str, d);
}

inline bool stof(const wchar *s, float &d) {
  wchars str = chars_of(s);
  return parse_real(str, d);
}

// Convenience form of stof(): returns the parsed value directly. The success
// flag is ignored; the result starts out as 0.
inline double stof(const wchar *s) {
  double value = 0;
  stof(s, value);
  return value;
}

} // namespace tool

#endif /* __tl_ustring_h */
