//|
//|
//| Copyright (c) 2001-2005
//| Andrew Fedoniouk - andrew@terrainformatica.com
//|
//| UCS2 COW string
//|
//|

#include "config.h"
#include "tool.h"

#include "snprintf.h"
#include "tl_streams.h"
#include "tl_ustring.h"
#include "wctype.h"
#include <ctype.h>
#include <stdarg.h>
#include <stdlib.h>
#include <string.h>

#include <stdio.h>

namespace tool {

// Byte-order-mark (BOM) byte sequences.
// UTF-8 BOM: the three bytes EF BB BF.
byte UTF8_BOM[3] = {0xEF, 0xBB, 0xBF};
// UTF-16 little-endian BOM: the two bytes FF FE.
byte UTF16LE_BOM[2] = { 0xFF,0xFE };

// Exception type thrown by u8::get_next_utf8() when a byte is not a valid
// UTF-8 continuation byte (10xxxxxx). The u8::getc() overloads below catch it
// and report '?' instead.
struct bad_utf8_sequence {
  int dummy; // placeholder member; not referenced in the code visible here
};

// Converts the multi-byte text in `s` to a ustring.
//   codepage - code page identifier handed to MultiByteToWideChar (WINDOWS builds only).
//   s        - source bytes.
// On WINDOWS builds the conversion is done in two passes: the first call asks
// for the required length, the second fills a string of that length.
// On other builds `codepage` is ignored and `s` is decoded with u8::cvt().
ustring u16::cvt(uint codepage, bytes s) {
#ifdef WINDOWS
  const char *src = (const char*)s.cbegin();

  // Pass 1: number of wide characters needed.
  const int needed = MultiByteToWideChar(codepage, 0, src, s.size(), 0, 0);

  ustring result(wchar('\0'), needed);
  if (result.size() != needed)
    return result; // string was not created with the requested size - do not write into it

  // Pass 2: actual conversion into the pre-sized buffer.
  MultiByteToWideChar(codepage, 0, src, s.size(), result.buffer(), needed);
  return result;
#else
#pragma TODO("codepage needs to be implemented")
  // No code page support here yet: the input is handled as UTF-8.
  return u8::cvt(s);
#endif
}

// Converts the wide-character text in `s` to a multi-byte string.
//   codepage - code page identifier handed to WideCharToMultiByte (WINDOWS builds only).
//   s        - source wide characters.
// On WINDOWS builds the conversion is done in two passes: the first call asks
// for the required byte count, the second fills a buffer of that size.
// On other builds `codepage` is ignored and `s` is encoded with u8::cvt().
string u16::cvt(uint codepage, wchars s) {
#ifdef WINDOWS
  // Pass 1: number of bytes needed.
  const int needed = WideCharToMultiByte(codepage, 0, s.start, s.size(), 0, 0, 0, 0);

  // Pass 2: actual conversion into a buffer of that size.
  array<char> encoded(needed);
  WideCharToMultiByte(codepage, 0, s.start, s.size(), encoded.begin(), encoded.size(), 0, 0);
  return encoded();
#else
#pragma TODO("codepage needs to be implemented")
  // No code page support here yet: the output is produced as UTF-8.
  return u8::cvt(s);
#endif
}


namespace u8 {

  // Validates a UTF-8 continuation byte and extracts its payload.
  //   val - the byte value to check (any value whose bits 7..6 are not `10`
  //         is rejected).
  // Returns the low six bits of `val`.
  // Throws bad_utf8_sequence when `val` is not of the form 10xxxxxx.
  inline uint get_next_utf8(unsigned int val) {
    const bool is_continuation = (val & 0xc0) == 0x80;
    if (!is_continuation)
      throw bad_utf8_sequence(); // bad continuation of multi-byte UTF-8 sequence

    return val & 0x3f; // significant bits only
  }

  // Decodes the UTF-8 bytes in `src` and appends the result to `buf`.
  //   src          - UTF-8 encoded input.
  //   buf          - receives the decoded code units (appended with `+=`).
  //   fail_on_fail - when true, decoding stops and false is returned as soon as
  //                  a malformed sequence is seen; when false, decoding carries
  //                  on to the end of `src`.
  // Returns true when no malformed sequence was met (also for empty input).
  //
  // When sizeof(wchar) == 2, 4-byte sequences are written as a UTF-16
  // surrogate pair; otherwise they are written as a single code point.
  // A U+FEFF that ends up as the only element of `buf` is treated as a BOM and
  // dropped.
  // NOTE: for a malformed 2/3/4-byte sequence a code unit built from the bits
  // that could be read is still appended before the failure is reported.
  bool to_utf16(tool::bytes src, tool::array<wchar> &buf, bool fail_on_fail)
  {
    if (src.length == 0)
      return true;
    buf.reserve(src.length);
    const byte *pc = src.cbegin();
    const byte *const last = src.cend();

    bool invalid = false;

    // Reads one continuation byte (10xxxxxx) at `pc` and returns its six
    // payload bits. On end of input or on a byte of a different shape it sets
    // `invalid` and returns 0. A present byte is always consumed; `pc` is never
    // moved past `last`.
    auto next_continuation = [&invalid, &pc, last]() -> unsigned int {
      if (pc >= last) {
        invalid = true;
        return 0; // unfinished multi-byte UTF-8 sequence at EOF
      }

      const unsigned int val = *pc++;

      if ((val & 0xc0) != 0x80) {
        invalid = true;
        return 0; // bad continuation of multi-byte UTF-8 sequence
      }

      return (val & 0x3f);
    };

    while (pc < last) {
      const uint b1 = *pc++;

      // Determine whether we are dealing
      // with a one-, two-, three-, or four-
      // byte sequence.
      if ((b1 & 0x80) == 0) {
        // 1-byte sequence: 000000000xxxxxxx = 0xxxxxxx
        buf += (wchar)b1;
      }
      else if ((b1 & 0xe0) == 0xc0) {
        // 2-byte sequence: 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx
        buf += (wchar)(((b1 & 0x1f) << 6) | next_continuation());
      }
      else if ((b1 & 0xf0) == 0xe0) {
        const uint b2 = next_continuation();
        const uint b3 = next_continuation();

        // 3-byte sequence: zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx
        buf += (wchar)(((b1 & 0x0f) << 12) | (b2 << 6) | b3);

        if (buf.size() == 1 && buf[0] == 0xFEFF) // bom
          buf.size(0);

      }
      else if ((b1 & 0xf8) == 0xf0) {
        // 4-byte sequence: 110110wwwwzzzzyy + 110111yyyyxxxxxx
        //     = 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx
        // (uuuuu = wwww + 1)

        const uint b2 = next_continuation();
        const uint b3 = next_continuation();
        const uint b4 = next_continuation();

        if (sizeof(wchar) == 2) {
          // uuuuu is assembled first and only then decremented; subtracting
          // from the low two bits alone underflows whenever they are zero.
          const uint uuuuu = ((b1 & 0x07) << 2) | ((b2 & 0x30) >> 4);
          buf += wchar(0xd800 |
            ((uuuuu - 1) << 6) |
            ((b2 & 0x0f) << 2) | ((b3 & 0x30) >> 4));
          buf += wchar(0xdc00 | ((b3 & 0x0f) << 6) | b4);
          // TODO: test that surrogate value is legal.
        }
        else
          buf += wchar(((b1 & 7) << 18) | ((b2 & 0x3f) << 12) |
          ((b3 & 0x3f) << 6) | (b4 & 0x3f));
      }
      else {
        // bad start for UTF-8 multi-byte sequence
        invalid = true;
      }

      if (fail_on_fail && invalid)
        return false;
    }

    return !invalid;
  }

  // Reads one UTF-8 encoded character from stream `f`.
  // Returns:
  //   - stream::EOS when the first read reports end of stream;
  //   - the decoded value (cast to wchar) for 1-, 2- and 3-byte sequences;
  //   - L'?' for a 4-byte sequence (not representable here), for a bad lead
  //     byte and for a bad or missing continuation byte.
  // The continuation bytes of a 4-byte sequence are consumed, so one such
  // character yields a single '?' rather than one '?' per byte.
  int getc(stream *f) {
    const int t = f->read();
    if (t == stream::EOS)
      return t;
    const unsigned int b1 = (unsigned int)t;

    try {

      // Determine whether we are dealing
      // with a one-, two-, three-, or four-
      // byte sequence.
      if ((b1 & 0x80) == 0) {
        // 1-byte sequence: 000000000xxxxxxx = 0xxxxxxx
        return (wchar)b1;
      }
      else if ((b1 & 0xe0) == 0xc0) {
        // 2-byte sequence: 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx
        uint r = (b1 & 0x1f) << 6;
        r |= get_next_utf8(f->read());
        return (wchar)r;
      }
      else if ((b1 & 0xf0) == 0xe0) {
        // 3-byte sequence: zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx
        uint r = (b1 & 0x0f) << 12;
        r |= get_next_utf8(f->read()) << 6;
        r |= get_next_utf8(f->read());
        return (wchar)r;
      }
      else if ((b1 & 0xf8) == 0xf0) {
        // 4-byte sequence: 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx
        // Not decoded by this function; skip its three continuation bytes
        // so they are not misread as the start of following characters.
        get_next_utf8(f->read());
        get_next_utf8(f->read());
        get_next_utf8(f->read());
        return L'?';
      }
      else {
        // bad start for UTF-8 multi-byte sequence
        return L'?';
      }

    }
    catch (const bad_utf8_sequence &) {
      return L'?';
    }
  }

  // Takes the next byte from the front of `buf` (via `buf++`).
  // Returns 0 when `buf` is empty; `buf` is left untouched in that case.
  inline unsigned int getb(bytes &buf) {
    unsigned int value = 0;
    if (buf.length != 0)
      value = buf++;
    return value;
  }

  // Decodes one UTF-8 encoded character from the front of `buf`, consuming the
  // bytes it reads.
  // Returns:
  //   - 0 when `buf` is empty or the next byte is 0;
  //   - the decoded value for 1-, 2- and 3-byte sequences (cast to wchar);
  //   - the full code point for a 4-byte sequence;
  //   - L'?' for a bad lead byte and for a bad or missing continuation byte.
  uint getc(bytes &buf) {
    const unsigned int lead = getb(buf);
    if (lead == 0)
      return 0;

    try {
      // 1-byte sequence: 0xxxxxxx
      if ((lead & 0x80) == 0)
        return (wchar)lead;

      // 2-byte sequence: 110yyyyy 10xxxxxx
      if ((lead & 0xe0) == 0xc0) {
        uint cp = (lead & 0x1f) << 6;
        cp |= get_next_utf8(getb(buf));
        return (wchar)cp;
      }

      // 3-byte sequence: 1110zzzz 10yyyyyy 10xxxxxx
      if ((lead & 0xf0) == 0xe0) {
        uint cp = (lead & 0x0f) << 12;
        cp |= get_next_utf8(getb(buf)) << 6;
        cp |= get_next_utf8(getb(buf));
        return (wchar)cp;
      }

      // 4-byte sequence: 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx
      if ((lead & 0xf8) == 0xf0) {
        const uint second = get_next_utf8(getb(buf));
        const uint third  = get_next_utf8(getb(buf));
        const uint fourth = get_next_utf8(getb(buf));
        return ((lead & 7) << 18) | ((second & 0x3f) << 12) |
               ((third & 0x3f) << 6) | (fourth & 0x3f);
      }

      // bad start for UTF-8 multi-byte sequence
      return L'?';
    }
    catch (const bad_utf8_sequence &) {
      return L'?';
    }
  }

  // Reads the byte at index `pos` of `buf` and moves `pos` to the next index.
  // Returns 0, leaving `pos` unchanged, when `pos` (taken as unsigned) is not
  // below buf.length.
  inline unsigned int getb(const bytes &buf, int &pos) {
    unsigned int value = 0;
    if (uint(pos) < buf.length)
      value = buf[pos++];
    return value;
  }

  // ATTN: UCS-2 only!
  // Decodes one UTF-8 encoded character of `buf` starting at index `pos` and
  // advances `pos` past the bytes it reads.
  // Returns:
  //   - 0 when `pos` is outside `buf` or the byte at `pos` is 0;
  //   - the decoded value for 1-, 2- and 3-byte sequences;
  //   - L'?' for a 4-byte sequence (does not fit the UCS-2 result), for a bad
  //     lead byte and for a bad or missing continuation byte.
  // The continuation bytes of a 4-byte sequence are consumed, so one such
  // character yields a single '?' rather than one '?' per byte.
  wchar getc(const bytes &buf, int &pos) {
    const unsigned int b1 = getb(buf, pos);
    if (!b1)
      return 0;

    try {

      // Determine whether we are dealing
      // with a one-, two-, three-, or four-
      // byte sequence.
      if ((b1 & 0x80) == 0) {
        // 1-byte sequence: 000000000xxxxxxx = 0xxxxxxx
        return (wchar)b1;
      }
      else if ((b1 & 0xe0) == 0xc0) {
        // 2-byte sequence: 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx
        uint r = (b1 & 0x1f) << 6;
        r |= get_next_utf8(getb(buf, pos));
        return (wchar)r;
      }
      else if ((b1 & 0xf0) == 0xe0) {
        // 3-byte sequence: zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx
        uint r = (b1 & 0x0f) << 12;
        r |= get_next_utf8(getb(buf, pos)) << 6;
        r |= get_next_utf8(getb(buf, pos));
        return (wchar)r;
      }
      else if ((b1 & 0xf8) == 0xf0) {
        // 4-byte sequence: 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx
        // Not representable in the result; skip its three continuation bytes
        // so they are not misread as the start of following characters.
        get_next_utf8(getb(buf, pos));
        get_next_utf8(getb(buf, pos));
        get_next_utf8(getb(buf, pos));
        return L'?';
      }
      else {
        // bad start for UTF-8 multi-byte sequence
        return L'?';
      }
    }
    catch (const bad_utf8_sequence &) {
      return L'?';
    }
  }
}


#include "html_entities_ph.h"

  // Unicode code points for the Windows-1252 bytes 0x80..0x9F, indexed by
  // (byte - 0x80). html_unescape() uses it to remap numeric character
  // references that fall into that range.
  // A zero entry marks a byte Windows-1252 leaves unassigned
  // (0x81, 0x8D, 0x8F, 0x90, 0x9D).
  wchar MSCP1252[] = {
      0x20AC, 0,      0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, // 0x80..0x87
      0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0,      0x017D, 0,      // 0x88..0x8F
      0,      0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, // 0x90..0x97
      0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0,      0x017E, 0x0178  // 0x98..0x9F
  };

  // Resolves the body of an HTML character reference to a code point.
  //   name - reference text as used in this function: either "#" followed by
  //          decimal digits, "#x"/"#X" followed by hex digits, or an entity
  //          name that is looked up with html_entities::find_def().
  // Returns:
  //   - '?' when `name` is shorter than two characters;
  //   - for numeric references the parsed value, with 0x80..0x9F remapped
  //     through MSCP1252;
  //   - for named references the `value` of the matching definition;
  //   - 0 when the digits are malformed, the value exceeds 0x10FFFF, or the
  //     name is unknown.
  // The digits are parsed strictly inside the bounds of `name`, so the slice
  // does not have to be NUL-terminated; a NUL inside the slice ends the digits.
  // Signs and whitespace are not accepted.
  ucode html_unescape(chars name) {
    if (name.length < 2)
      return '?';

    if (name.start[0] != '#') {
      const html_entity_def *pe =
          html_entities::find_def(name.start, uint(name.length));
      if (pe)
        return pe->value;
      return 0;
    }

    // numeric value, char code
    const char *      p   = name.start + 1;
    const char *const end = name.start + name.length;

    uint base = 10;
    if (*p == 'x' || *p == 'X') { // length >= 2, so p is inside the slice
      ++p;
      base = 16;
    }

    uint value = 0;
    for (; p < end && *p != '\0'; ++p) {
      const char c = *p;
      uint       digit;
      if (c >= '0' && c <= '9')
        digit = uint(c - '0');
      else if (base == 16 && c >= 'a' && c <= 'f')
        digit = uint(c - 'a') + 10;
      else if (base == 16 && c >= 'A' && c <= 'F')
        digit = uint(c - 'A') + 10;
      else
        return 0; // not a digit of this base

      value = value * base + digit;
      if (value > 0x10FFFF)
        return 0; // beyond the last Unicode code point; also bounds `value`
    }

    if (value >= 0x80 && value <= 0x9F)
      return MSCP1252[value - 0x80];
    return (ucode)value;
  }

  namespace u16
  {
    // restore UCP (unicode code point) from utf16
    // Takes one code point from the front of `buf`, advancing buf.start and
    // shrinking buf.length by the units consumed.
    // Returns:
    //   - 0 when `buf` is empty;
    //   - the code unit itself when it is not a high surrogate (0xD800..0xDBFF);
    //   - the combined code point for a high surrogate followed by a low
    //     surrogate (0xDC00..0xDFFF);
    //   - the lone high surrogate when the next unit is not a low surrogate;
    //     that next unit stays in `buf`;
    //   - 0 (after assert) when a high surrogate is the last unit of `buf`.
    uint getc(wchars &buf) {
      if (buf.length == 0)
        return 0;
      const wchar c = *buf.start;
      ++buf.start;
      --buf.length;
      if (c < 0xD800 || c > 0xDBFF)
        return c; // not a surrogate pair
      if (buf.length == 0) {
        assert(false); // surrogate pair is not complete
        return 0;
      }
      const wchar nc = *buf.start;
      if (nc < 0xDC00 || nc > 0xDFFF)
        return c; // unpaired high surrogate: do not swallow the next unit
      ++buf.start;
      --buf.length;
      return (c - 0xD800) * 0x400 + (nc - 0xDC00) + 0x10000;
    }
    // encode UCP to two UTF16 code units
    //   U  - code point to encode.
    //   W2 - output buffer with room for two code units.
    // Returns the number of code units written:
    //   0 - `U` is above 0x10FFFF, nothing written;
    //   1 - `U` is below 0x10000, written to W2[0] as is;
    //   2 - `U` written as a surrogate pair to W2[0], W2[1].
    uint putc(uint U, wchar *W2 /*W[2]*/) {
      if (U > 0x10FFFF) // 0x10FFFF itself is the last valid code point
        return 0;       // wrong value of UCP.
      if (U < 0x10000) {
        W2[0] = wchar(U);
        return 1;
      }
      const uint offset = U - 0x10000;
      W2[0] = wchar(offset / 0x400 + 0xD800);
      W2[1] = wchar(offset % 0x400 + 0xDC00);

    #ifdef _DEBUG
      // round-trip check: decoding the pair must give back U
      wchars tt(W2, 2);
      uint   tr = getc(tt);
      assert(tr == U);
    #endif

      return 2;
    }

    // Moves `pos` by `n` characters inside `buf`, treating a surrogate head
    // immediately followed by a surrogate tail as one character.
    //   buf - UTF-16 text.
    //   n   - number of characters to move; negative values move backward.
    //   pos - in/out index into `buf`.
    // Returns true when all `n` steps were made; false when the start or the
    // end of `buf` was hit first (`pos` is then 0 or the length of `buf`).
    // A head without a following tail, or a tail without a preceding head,
    // counts as one character, so `pos` never ends up beyond the end of `buf`.
    bool advance(const wchars &buf, int n, int &pos) {
      const int len = int(buf.length);
      if (n >= 0)
        for (; n > 0; --n) {
          if (pos >= len) {
            pos = len;
            break;
          }
          const bool is_pair = is_suro_head(buf[pos]) && pos + 1 < len &&
                               is_suro_tail(buf[pos + 1]);
          pos += is_pair ? 2 : 1;
        }
      else // n < 0 - backward
        for (n = -n; n > 0; --n) {
          if (--pos < 0) {
            pos = 0;
            break;
          }
          if (pos >= len)
            continue; // still beyond the end: the step only brings pos closer
          // landed on the second half of a pair: step back to its head
          if (is_suro_tail(buf[pos]) && pos > 0 && is_suro_head(buf[pos - 1]))
            --pos;
        }
      return n == 0;
    }

  } // namespace u16

} // namespace tool

size_t utf8_to_wstr(const char* u8str, size_t u8str_length, wchar* out, size_t out_length) {
  tool::ustring r = tool::u8::cvt(tool::chars(u8str, u8str_length));
  tool::target(out, out_length).copy(r());
  return min(r.length(), out_length);
}

