﻿#ifndef __tl_markup_h__
#define __tl_markup_h__

//|
//| simple XML/HTML scanner/tokenizer
//|
//| (C) Andrew Fedoniouk @ terrainformatica.com
//|

#include "tl_array.h"
#include "tl_slice.h"
#include "tl_value_t.h"
#include "tl_tokenizer.h"

namespace tool {

// Defined elsewhere. The scanner calls it with the text collected between
// '&' and ';' and treats a zero result as "not resolved".
extern ucode                   html_unescape(chars name);
// Callback through which an entity resolver delivers the replacement text.
typedef function<void(wchars)> found_wchars_t;

namespace markup {
using tool::instream;

// Scanner-wide constants.
enum $ {
  // size of the fixed tag/attribute name buffers; also used as the length
  // limit of shorthand attribute values in the scanner
  MAX_NAME_SIZE            = 256,
  // in-band marker characters: scanner::get_char() removes them from the
  // stream and only records that they were seen
  CHARMARK_SELECTION_START = 0x86, // U+0086  START OF SELECTED AREA †
  CHARMARK_SELECTION_END   = 0x87  // U+0087  END OF SELECTED AREA   ‡
};

template <typename CHAR_TYPE> class scanner {
public:
  typedef CHAR_TYPE        char_type;
  typedef slice<CHAR_TYPE> token_value;

  // Token kinds returned by get_token().
  enum token_type {
    TT_ERROR = -1,
    TT_EOF   = 0,

    TT_TAG_START, // <tag ...
                  //     ^-- happens here
    TT_TAG_END,   // </tag>
                  //       ^-- happens here

    TT_TAG_HEAD_END,
    // <tag ... >
    //           ^-- happens on non-empty tags here
    TT_EMPTY_TAG_END,
    // <tag ... />
    //            ^-- happens on empty tags here
    TT_ATTR, // <tag attr="value" >
             //                  ^-- happens here
    TT_TEXT, // character data between tags (non-details mode)

    TT_COMMENT, // "<!--" ...value... "-->"
    TT_CDATA,   // "<![CDATA[" ...value... "]]>"
    TT_PI,      // <? .....  ?>
                //             ^-- happens after PI processing
    TT_WORD,    // in details mode these will be generated
    TT_SPACE,   // instead of TT_TEXT above

    TT_DOCTYPE, // <!DOCTYPE ...value... >

    TT_ENTITIY, // <!ENTITY name "value" >  (enumerator spelling is historical)

  };

public:
  scanner<CHAR_TYPE>(
      instream<CHAR_TYPE> &                    is,
      function<bool(chars, found_wchars_t cb)> entity_resolver    = nullptr,
      function<void(chars, wchars)>            entity_registrator = nullptr)
      : input(is), input_char(0), tag_name_length(0), c_scan(0),
        attr_name_length(0), entity_resolver_cb(entity_resolver),
        entity_registrator_cb(entity_registrator), token(),
        details_mode(), got_selection_start(false), got_selection_end(false) {
    c_scan = &scanner<CHAR_TYPE>::scan_body;
    memzero(tag_name);
    memzero(attr_name);
    // token_start_cb = [](int n) {};
    // token_end_cb = [](int n) {};
  }

  // Runs the currently selected scan routine and returns the token it found.
  // `details` is stored as the details-mode flag before scanning.
  // If token_end_cb is set it is invoked afterwards with -1 when a character
  // is currently pushed back and with 0 otherwise.
  token_type get_token(bool details = false) {
    details_mode = details;

    const token_type result = (this->*c_scan)();

    if (token_end_cb) {
      const int adjustment = was_push_back() ? -1 : 0;
      token_end_cb(adjustment);
    }
    return result;
  }

  // get value collected for the last token: text of TT_TEXT / TT_WORD /
  // TT_SPACE, attribute value of TT_ATTR, body of TT_COMMENT / TT_CDATA /
  // TT_PI / TT_DOCTYPE
  token_value get_value() {
    // push a zero and pop it again: the slice length is unchanged.
    // NOTE(review): presumably this leaves a 0 terminator just past the end
    // of the returned slice - depends on array::pop keeping storage; confirm.
    value.push(0);
    value.pop();
    return value();
  }

  // get attribute name of the last TT_ATTR token.
  // The returned slice points into the scanner's internal buffer, which is
  // zero-terminated here and overwritten when the next attribute is scanned.
  const chars get_attr_name() {
    attr_name[attr_name_length] = 0;
    return chars(attr_name, attr_name_length);
  }

  // get tag name collected by the last tag scan.
  // The returned slice points into the scanner's internal buffer, which is
  // zero-terminated here and overwritten when the next tag is scanned.
  const chars get_tag_name() {
    tag_name[tag_name_length] = 0;
    return chars(tag_name, tag_name_length);
  }

  // line number and url as reported by the underlying input stream
  int    get_line_no() const { return input.line_no; }
  string get_url() { return input.url; }

  // Reads raw character data up to (and consuming) `tail`, which is matched
  // with icmp(); the data is then available through get_value().
  // Returns false if the input ended before `tail` was seen.
  bool get_cdata_until(token_value tail) {
    return scan_cdata_until(tail) == TT_CDATA;
  }

  // should be overridden to resolve entities, e.g. &nbsp;
  // virtual char_type  resolve_entity(const char* buf, int buf_size) {
  //  return html_unescape(chars(buf,buf_size));
  //}

  // the stream this scanner reads from
  instream<CHAR_TYPE> &get_input() { return input; }

  // true while a pushed-back (non-zero) character is waiting to be re-read
  inline bool was_push_back() const { return input_char != 0; }

  // Optional observers.
  // token_start_cb is called with -1 when scan_tag()/scan_head() start.
  // token_end_cb is called by get_token() after each token, with -1 if a
  // character is pushed back at that moment and 0 otherwise.
  function<void(int)> token_start_cb; // if set will receive token start events
  function<void(int)> token_end_cb;   // if set will receive token end events

  // Reports whether a selection-start marker was consumed since the last
  // call, and resets the flag (read-and-clear).
  inline bool saw_selection_start() const {
    const bool seen     = got_selection_start;
    got_selection_start = false;
    return seen;
  }
  // Reports whether a selection-end marker was consumed since the last
  // call, and resets the flag (read-and-clear).
  inline bool saw_selection_end() const {
    const bool seen   = got_selection_end;
    got_selection_end = false;
    return seen;
  }

  // Returns the recorded selection-start offset inside the current text
  // value and forgets it; returns an undefined int_v when none is recorded.
  inline int_v pos_selection_start() const {
    if (!selection_start_pos.is_defined())
      return int_v();

    int offset = selection_start_pos;
    selection_start_pos.clear();
    return offset;
  }
  // Returns the recorded selection-end offset inside the current text
  // value and forgets it; returns an undefined int_v when none is recorded.
  inline int_v pos_selection_end() const {
    if (!selection_end_pos.is_defined())
      return int_v();

    int offset = selection_end_pos;
    selection_end_pos.clear();
    return offset;
  }

private: /* methods */
  // Pointer to the member function that produces the next token.
  // get_token() dispatches through c_scan; the scan_* routines reassign it
  // to switch between body / tag-head scanning.
  typedef token_type (scanner::*scan)();
  scan c_scan; // current 'reader'

  /*
  // content 'readers'
  token_type  scan_body();
  token_type  scan_head();
  token_type  scan_comment();
  token_type  scan_cdata();
  token_type  scan_pi();
  token_type  scan_tag();

  char_type   skip_whitespace();
  void        push_back(char_type c);

  char_type   get_char();
  char_type   scan_entity();

  bool        is_whitespace(char_type c);
    



  void        append_value(char_type c);
  void        append_attr_name(char_type c);
  void        append_tag_name(char_type c);
  */

private: /* data */
  // enum state { TEXT = 0, MARKUP = 1, COMMENT = 2, CDATA = 3, PI = 4 };
  // state       where;
  // NOTE(review): `token` is only value-initialized in the visible code - it
  // looks unused; confirm.
  token_type token;

  // characters collected for the current token (see get_value())
  array<char_type> value;

  // name of the tag being scanned; always stored as narrow chars
  char tag_name[MAX_NAME_SIZE];
  int  tag_name_length;

  // name of the attribute being scanned; always stored as narrow chars
  char                                     attr_name[MAX_NAME_SIZE];
  int                                      attr_name_length;
  // optional resolver for entities html_unescape() does not know
  function<bool(chars, found_wchars_t cb)> entity_resolver_cb;
  function<void(chars, wchars)>
      entity_registrator_cb; // parsed <!ENTITY name "value"> registrator

  instream<CHAR_TYPE> &input;
  // single-character push-back slot; 0 means "empty"
  char_type            input_char;
  //int                  line_no;
  // set by get_token(): when true the body is split into TT_WORD / TT_SPACE
  bool                 details_mode;

  // set by get_char() when a selection marker is consumed;
  // mutable because the const saw_selection_*() accessors clear them
  mutable bool got_selection_start;
  mutable bool got_selection_end;

  // offsets inside `value` where selection markers were seen (scan_body);
  // mutable because the const pos_selection_*() accessors clear them
  mutable int_v selection_start_pos;
  mutable int_v selection_end_pos;

  // Case-sensitive equality test of the first `length` characters of `s`
  // and `s1`.
  // Up to 8 characters are compared one by one, last position first, without
  // looking for a terminating zero; longer inputs are delegated to strncmp(),
  // which stops at the first zero.
  inline bool equal(const char *s, const char *s1, size_t length) {
    if (length > 8)
      return strncmp(s, s1, length) == 0;

    for (size_t k = length; k > 0; --k) {
      if (s1[k - 1] != s[k - 1])
        return false;
    }
    return true;
  }

  // Scans content outside of tags.
  // Returns TT_EOF at end of input, delegates to scan_tag() when the first
  // character is '<', otherwise collects character data into `value`:
  //  - normal mode: everything up to the next '<' or end of input -> TT_TEXT
  //  - details mode: either a run of whitespace -> TT_SPACE
  //                  or a run of non-whitespace -> TT_WORD
  // Entity references are expanded into `value` by scan_entity().
  inline token_type scan_body() {
    value.clear();
    bool ws = false;

    char_type c;

    // look at the first character only (every path leaves the loop)
    for (c = get_char();; c = get_char()) {
      if (c == 0)
        return TT_EOF;
      if (c == '<')
        return scan_tag();
      if (c == '&') {
        scan_entity();
        c = 0; // expansion already went into value; nothing more to push
        break;
      }
      ws = is_whitespace(c);
      break; // got it
    }

    if (!details_mode) {
      while (true) {
        // a selection marker consumed by the previous get_char() sits
        // logically before the character about to be stored
        if (saw_selection_start())
          selection_start_pos = value.size();
        if (saw_selection_end())
          selection_end_pos = value.size();
        if (c)
          value.push(c);
        c = get_char();
        if (c == 0) {
          push_back(c); // pushing 0 leaves the push-back slot empty
          break;
        }
        if (c == '<') {
          // marker located right before the '<'
          if (saw_selection_start())
            selection_start_pos = value.size();
          if (saw_selection_end())
            selection_end_pos = value.size();
          push_back(c); // the next get_token() call will see the '<'
          break;
        }
        if (c == '&') {
          scan_entity();
          c = 0;
        }
      }
      return TT_TEXT;
    }
    // details mode
    if (ws) {
      // run of whitespace
      while (true) {
        if (c)
          value.push(c);
        c = get_char();
        if (c == 0) {
          push_back(c);
          break;
        }
        if (c == '<') {
          push_back(c);
          break;
        }
        if (!is_whitespace(c)) {
          push_back(c);
          break;
        }
        // not reachable: '&' is not whitespace, so the check above has
        // already pushed it back and left the loop
        if (c == '&') {
          scan_entity();
          c = 0;
        }
      }
      return TT_SPACE;
    } else {
      // run of non-whitespace
      while (true) {
        if (c)
          value.push(c);
        c = get_char();
        if (c == 0) {
          push_back(c);
          break;
        }
        if (c == '<') {
          push_back(c);
          break;
        }
        if (is_whitespace(c)) {
          push_back(c);
          break;
        }
        if (c == '&') {
          scan_entity();
          c = 0;
        }
      }
      return TT_WORD;
    }
  }

  // Scans one item inside a tag head (after the tag name).
  // Returns:
  //   TT_TAG_HEAD_END  on '>'   (switches back to body scanning)
  //   TT_EMPTY_TAG_END on "/>"  (switches back to body scanning)
  //   TT_ATTR          for one attribute; name via get_attr_name(), value
  //                    via get_value(). Value-less attributes yield an
  //                    empty value.
  //   TT_EOF / TT_ERROR on premature end of input or malformed markup.
  // Besides name="value" it understands the shorthand forms
  //   (text) -> name="text"    .text -> class="text"   #text -> id="text"
  //   |text  -> type="text"    !text -> type="text"
  inline token_type scan_head() {
    char_type c = skip_whitespace();

    attr_name_length = 0;
    value.clear();

    if (token_start_cb)
      token_start_cb(-1);

    switch (c) {
    case '>':
      c_scan = &scanner<char_type>::scan_body;
      return TT_TAG_HEAD_END;

    case '/': {
      char_type t = get_char();
      if (t == '>') {
        c_scan = &scanner<char_type>::scan_body;
        return TT_EMPTY_TAG_END;
      } else {
        push_back(t);
        return TT_ERROR;
      } // erroneous situation - standalone '/'
    }

    case '(': {
      // h-smile special case: (name)
      int nested_parens = 0;
      append_attr_name(CHARS("name"));
      // collect up to the matching ')'; nested parentheses stay in the value
      while (value.size() < MAX_NAME_SIZE) {
        char_type c = get_char(); // note: shadows the outer `c`
        switch (c) {
        case 0:
          return TT_EOF;
        case '(':
          ++nested_parens;
          append_value(c);
          break;
        case ')':
          if (0 == nested_parens--)
            return TT_ATTR;
          // else fall through
        default:
          append_value(c);
          break;
        }
      }
      return TT_ERROR; // value reached MAX_NAME_SIZE without closing ')'
    }
    case '.':
      append_attr_name(CHARS("class"));
      goto COLLECT_UNQUOTED_VALUE;
    case '#':
      append_attr_name(CHARS("id"));
      goto COLLECT_UNQUOTED_VALUE;
    case '|':
      append_attr_name(CHARS("type"));
      goto COLLECT_UNQUOTED_VALUE;
    case '!':
      append_attr_name(CHARS("type"));
    COLLECT_UNQUOTED_VALUE:
      // shorthand value: ends at whitespace (consumed) or at the start of
      // the next shorthand / end of the tag head (pushed back)
      while (value.size() < MAX_NAME_SIZE) {
        c = get_char();
        if (!c)
          return TT_ERROR;
        if (is_whitespace(c))
          return TT_ATTR;
        if (c == '.' || c == '#' || c == '!' || c == '|' || c == '(' ||
            c == '/' || c == '>') {
          push_back(c);
          return TT_ATTR;
        }
        if (c == '&') {
          scan_entity();
          continue;
        }
        append_value(c);
      }
      return TT_ERROR; // value reached MAX_NAME_SIZE
      // case '@':
      //  attr_name_length = attr_name_length;

    default:
      break;
    }

    // attribute name...
    while (c != '=') {
      if (c == 0)
        return TT_EOF;
      if (c == '>' || c == '/') {
        push_back(c);
        return TT_ATTR;
      } // attribute without value (HTML style)
      // a shorthand marker directly after a name ends the name
      // (NOTE(review): '|' is not in this list, unlike the list above)
      if (attr_name_length && (c == '.' || c == '#' || c == '!' || c == '(')) {
        push_back(c);
        return TT_ATTR;
      }
      if (is_whitespace(c)) {
        c = skip_whitespace();
        if (c != '=') {
          push_back(c);
          return TT_ATTR;
        } // attribute without value (HTML style)
        else
          break;
      }
      if (c == '<')
        return TT_ERROR;
      append_attr_name(c);
      c = get_char();
    }

    c = skip_whitespace();
    // attribute value...

    if (c == '\"')
      for (c = get_char(); c; c = get_char()) {
        if (c == '\"')
          return TT_ATTR;
        if (c == '&') {
          scan_entity();
          continue;
        }
        append_value(c);
      }
    else if (c == '\'') // allowed in html
      for (c = get_char(); c; c = get_char()) {
        if (c == '\'')
          return TT_ATTR;
        if (c == '&') {
          scan_entity();
          continue;
        }
        append_value(c);
      }
    else if (c == '>') // attr= >
    {
      push_back(c);
      return TT_ATTR; // let it be empty attribute.
    } else            // scan token, allowed in html: e.g. align=center
    {
      // the first character is stored as is: an '&' in this position is
      // not expanded, and a 0 (end of input) is stored too
      append_value(c);
      for (c = get_char(); c; c = get_char()) {
        if (is_whitespace(c))
          return TT_ATTR;
        if (c == '/' || c == '>') {
          push_back(c);
          return TT_ATTR;
        }
        if (c == '&') {
          scan_entity();
          continue;
        }
        append_value(c);
      }
    }
    // input ended inside an attribute value
    return TT_ERROR;
  }

  // Decides whether `ct` may extend the tag name collected so far.
  // Letters and digits are always accepted. Other characters depend on the
  // current name:
  //   empty name          : '-', '_', '!'       (start of "!--", "![CDATA[")
  //   name starting '!'   : ':', '-', '_', '['
  //   any other name      : ':', '-', '_'       (XML also allows '.', we don't)
  inline bool valid_tag_name_char(char_type ct) {
    if (is_alnum(ct))
      return true;

    auto current = get_tag_name();

    const bool dash_or_underscore = (ct == '-' || ct == '_');

    if (!current)
      return dash_or_underscore || ct == '!';

    if (current[0] == '!')
      return dash_or_underscore || ct == ':' || ct == '[';

    return dash_or_underscore || ct == ':';
  }

  // Adapter exposing the append_value() interface expected by
  // scan_entity(RCV*) on top of an arbitrary target array.
  // Holds the target by reference; used by scan_entity_decl() to expand
  // entities into the declared entity's value instead of the token value.
  class char_receiver {
    array<CHAR_TYPE> &trg;

  public:
    char_receiver(array<char_type> &target) : trg(target) {}
    void append_value(char_type c) { trg.push(c); }
    void append_value(slice<char_type> c) { trg.push(c); }
    NONCOPYABLE(char_receiver);
  };

  // caller already consumed '<'
  // scan header start or tag tail
  //
  // Returns:
  //   TT_TAG_START  after "<name"; scanning continues with scan_head()
  //   TT_TAG_END    after "</name>"
  //   TT_TEXT       with value "<" when '<' does not start a tag
  //   TT_ERROR      on end of input inside the name or a malformed tail
  //   or the result of the specialized scanner for "<?", "<(", "<!--",
  //   "<![CDATA[", "<!DOCTYPE" and (only if a registrator is set) "<!ENTITY".
  inline token_type scan_tag() {
    tag_name_length = 0;

    if (token_start_cb)
      token_start_cb(-1);

    char_type c = get_char();
    if (c == '?')
      return scan_pi();
    else if (c == '(')
      return scan_output_0();

    bool is_tail = c == '/';
    if (is_tail)
      c = get_char();
    else if (!is_alpha(c) && c != '!') {
      // not a tag: report the '<' as text and re-read `c` next time
      value.push('<');
      push_back(c);
      return TT_TEXT;
    }

    // collect the tag name
    while (c) {

      if (is_whitespace(c)) { /*c = skip_whitespace();*/
        break;
      }

      if (c == '/')
        break;

      if (c == '>')
        break;

      /*if( c == '!' && tag_name_length == 0 ) -- this logic is moved into
      valid_tag_name_char
        ;
      else */
      if (!valid_tag_name_char(c))
        break;

      append_tag_name(c);

      if (!is_tail) {
        // as soon as the collected name equals one of the special
        // prefixes, hand over to the dedicated scanner
        chars tn(tag_name, tag_name_length);
        switch (tag_name_length) {
        case 3:
          if (tn == CHARS("!--"))
            return scan_comment();
          break;
        case 8:
          if (tn == CHARS("![CDATA["))
            return scan_cdata();
          else if (icmp(get_tag_name(), CHARS("!DOCTYPE")))
            return scan_doctype();
          break;
        case 7:
          if (entity_registrator_cb && icmp(get_tag_name(), CHARS("!ENTITY")))
            return scan_entity_decl();
          break;
        }
      }
      c = get_char();
    }

    if (c == 0)
      return TT_ERROR;

    if (is_tail) {
      // only '>' directly after the name is accepted here; whitespace
      // before it (e.g. "</div >") ends up as TT_ERROR
      if (c == '>') {
        //c = get_char(); push_back(c); // to trigger got_selection_start/end just after the tail.
        // commented out due to problems with 
        // <div><include src="test.svg"></include></div>
        // logic should be moved to this::peek_char() 
        return TT_TAG_END;
      }
      return TT_ERROR;
    } else
      push_back(c); // let scan_head() see the character that ended the name

    c_scan = &scanner<CHAR_TYPE>::scan_head;
    return TT_TAG_START;
  }

  // Consumes characters until one that is not whitespace shows up and
  // returns it; returns 0 if the input ends first.
  inline char_type skip_whitespace() {
    char_type ch;
    do {
      ch = get_char();
    } while (ch != 0 && is_whitespace(ch));
    return ch;
  }

  // Makes `c` the next character returned by _get_char().
  // There is a single slot: a second call overwrites the first, and
  // pushing 0 leaves the slot empty.
  inline void push_back(char_type c) { input_char = c; }

  // Lowest-level read: hands out the pushed-back character if there is one
  // (emptying the slot), otherwise asks the input stream for the next one.
  // An earlier variant folded "\r\n" into '\n' and counted lines here; that
  // code was disabled and characters are now passed through unchanged.
  inline char_type _get_char() {
    if (!input_char)
      return input.get_char();

    const char_type pending = input_char;
    input_char              = 0;
    return pending;
  }

  // Returns the next character, transparently removing selection markers:
  // each CHARMARK_SELECTION_START / CHARMARK_SELECTION_END only raises the
  // corresponding flag and is never handed to the caller.
  inline char_type get_char() {
    char_type t;
    for (;;) {
      t = _get_char();
      if (t == CHARMARK_SELECTION_START) {
        got_selection_start = true;
        continue;
      }
      if (t == CHARMARK_SELECTION_END) {
        got_selection_end = true;
        continue;
      }
      break;
    }

    // diagnostic trace for carriage returns reaching the scanner
    if (t == '\r')
      dbg_printf("get_char(0x%x)\n", t);

    return t;
  }

  // True for characters allowed in an entity name:
  // letters, digits, '.', '-', '_' and ':'.
  inline bool is_valid_entity_name_char(char_type c) {
    return is_alnum(c) || c == '.' || c == '-' || c == '_' || c == ':';
  }

  // caller consumed '&'
  template <class RCV> inline void scan_entity(RCV *receiver) {
    char      buf[512];
    uint      i = 0;
    char_type t = 0;
    for (; i < (items_in(buf) - 1); ++i) {
      t = get_char();
      if (t == 0)
        return; // EOF;
      buf[i] = char(t);
      if (t == ';')
        break;
      if (!is_valid_entity_name_char(t) && t != '#') {
        // bad entity;
        uint n = 0;
        receiver->append_value('&');
        for (; i && n < i; ++n)
          receiver->append_value(buf[n]);
        receiver->append_value(t);
        return;
      }
    }
    buf[i] = 0;

    chars name = chars(buf, i);

    ucode uc = html_unescape(name);
    if (uc) {
      if (uc > 0xffff) {
        wchar w2[2];
        u16::putc(uc, w2);
        receiver->append_value(w2[0]);
        receiver->append_value(w2[1]);
      } else
        receiver->append_value(char_type(uc));
      return;
    }

    if (entity_resolver_cb && entity_resolver_cb(name, [&](wchars val) {
          receiver->append_value(val);
        }))
      return;

    // no luck ...
    receiver->append_value('&');
    for (uint n = 0; n < i; ++n)
      receiver->append_value(buf[n]);
    receiver->append_value(';');
  }

  // caller consumed '&'; expands the entity into the current token value
  inline void scan_entity() { scan_entity(this); }

  // True for space, tab, line feed, carriage return and form feed.
  inline bool is_whitespace(char_type c) {
    switch (c) {
    case ' ':
    case '\t':
    case '\n':
    case '\r':
    case '\f':
      return true;
    default:
      return false;
    }
  }

  // append one character to the current token value
  inline void append_value(char_type c) { value.push(c); }

  // append a run of characters to the current token value
  inline void append_value(slice<char_type> c) { value.push(c); }

  // Adds one character to the attribute name. The character is narrowed to
  // `char`; once the buffer is full (one slot stays reserved for the
  // terminator) further characters are dropped.
  inline void append_attr_name(char_type c) {
    if (attr_name_length >= (MAX_NAME_SIZE - 1))
      return;
    attr_name[attr_name_length] = char(c);
    ++attr_name_length;
  }

  // Adds every character of `text` to the attribute name.
  inline void append_attr_name(chars text) {
    while (char ch = text++)
      append_attr_name(char_type(ch));
  }

  // Adds one character to the tag name. The character is narrowed to
  // `char`; once the buffer is full (one slot stays reserved for the
  // terminator) further characters are dropped.
  inline void append_tag_name(char_type c) {
    if (tag_name_length >= (MAX_NAME_SIZE - 1))
      return;
    tag_name[tag_name_length] = char(c);
    ++tag_name_length;
  }

  // Adds every character of `text` to the tag name.
  inline void append_tag_name(chars text) {
    while (char ch = text++)
      append_tag_name(char_type(ch));
  }

  // Called after "<!--" was recognized. Collects characters into the token
  // value until it ends with "-->", strips that terminator and returns
  // TT_COMMENT. Returns TT_EOF if the input ends first.
  inline token_type scan_comment() {
    for (;;) {
      const char_type ch = get_char();
      if (ch == 0)
        return TT_EOF;

      value.push(ch);

      const int n = value.size();
      if (n < 3)
        continue;
      if (value[n - 3] == '-' && value[n - 2] == '-' && value[n - 1] == '>') {
        value.size(n - 3);
        break;
      }
    }
    c_scan = &scanner<char_type>::scan_body;
    return TT_COMMENT;
  }

  // Called after "<![CDATA[" was recognized. Collects characters into the
  // token value until it ends with "]]>", strips that terminator and returns
  // TT_CDATA. Returns TT_EOF if the input ends first.
  inline token_type scan_cdata() {
    for (;;) {
      const char_type ch = get_char();
      if (ch == 0)
        return TT_EOF;

      value.push(ch);

      const int n = value.size();
      if (n < 3)
        continue;
      if (value[n - 3] == ']' && value[n - 2] == ']' && value[n - 1] == '>') {
        value.size(n - 3);
        break;
      }
    }
    c_scan = &scanner<char_type>::scan_body;
    return TT_CDATA;
  }

  // Collects raw character data into the token value until it ends with
  // `tail` (compared with icmp()); the tail is removed from the value and
  // TT_CDATA is returned. Returns TT_EOF if the input ends first.
  // "<![CDATA[" / "]]>" wrappers met on the way are removed from the value,
  // and `tail` is not looked for while inside such a wrapper.
  inline token_type scan_cdata_until(token_value tail) {
    value.clear();
    bool inside_cdata = false;

    for (;;) {
      char_type ch = get_char();
      if (ch == 0)
        return TT_EOF;
      value.push(ch);

      if (inside_cdata) {
        // only the closing "]]>" matters while inside a CDATA section
        if (value().ends_with(WCHARS("]]>"))) {
          value.length(value.length() - WCHARS("]]>").length);
          inside_cdata = false;
        }
        continue;
      }

      if (value().ends_with(WCHARS("<![CDATA["))) {
        inside_cdata = true;
        value.length(value.length() - WCHARS("<![CDATA[").length);
        continue;
      }

      wchars last = value().end(tail.length);
      if (icmp(last, tail)) {
        value.length(value.length() - tail.length);
        break;
      }
    }

    c_scan = &scanner<char_type>::scan_body;
    return TT_CDATA;
  }

  // Called after "<?" was consumed. Collects characters into the token value
  // until it ends with "?>", strips that terminator and returns TT_PI.
  // Returns TT_EOF if the input ends first. The current scan routine is left
  // as it is.
  inline token_type scan_pi() {
    for (;;) {
      const char_type ch = get_char();
      if (ch == 0)
        return TT_EOF;

      value.push(ch);

      const int n = value.size();
      if (n < 2)
        continue;
      if (value[n - 2] == '?' && value[n - 1] == '>') {
        value.size(n - 2);
        break;
      }
    }
    // c_scan = &scan_body;
    return TT_PI;
  }

  // Caller consumed the "<(" preamble. Reports it as the start of an
  // "output" tag; the parenthesized text is read by scan_output_1() on the
  // next get_token() call.
  inline token_type scan_output_0() {
    tag_name_length = 0;
    append_tag_name(CHARS("output"));

    c_scan = &scanner<char_type>::scan_output_1;
    return TT_TAG_START;
  }

  // Second step of "<(text) ...": reports the parenthesized text as the
  // attribute name="text" and hands further scanning over to scan_head().
  // Nested parentheses are kept inside the value. Returns TT_EOF when the
  // input ends before the closing ')' and TT_ERROR when the value reaches
  // MAX_NAME_SIZE characters.
  inline token_type scan_output_1() {
    c_scan = &scanner<char_type>::scan_head;

    attr_name_length = 0;
    append_attr_name(CHARS("name"));

    value.clear();

    int depth = 0;
    while (value.size() < MAX_NAME_SIZE) {
      char_type ch = get_char();
      if (ch == 0)
        return TT_EOF;

      if (ch == '(') {
        ++depth;
      } else if (ch == ')') {
        if (depth == 0)
          return TT_ATTR; // the parenthesis closing the preamble
        --depth;
      }
      append_value(ch);
    }
    return TT_ERROR;
  }

  // Called after "<!DOCTYPE" was recognized. Collects everything up to the
  // next '>' (which is consumed but not stored) and returns TT_DOCTYPE.
  // Returns TT_EOF if the input ends first.
  inline token_type scan_doctype() {
    for (char_type ch = get_char(); ch != '>'; ch = get_char()) {
      if (ch == 0)
        return TT_EOF;
      value.push(ch);
    }
    c_scan = &scanner<char_type>::scan_body;
    return TT_DOCTYPE;
  }

  // Parses the rest of an <!ENTITY name "value"> declaration.
  // On success the name/value pair is passed to entity_registrator_cb and
  // TT_ENTITIY is returned. If the declaration is malformed, the characters
  // consumed so far are put back together into the token value and returned
  // as TT_TEXT.
  // Only reached from scan_tag() when entity_registrator_cb is set.
  inline token_type scan_entity_decl() // caller consumed <!ENTITY
  {
    array<char>  entity_name;
    array<wchar> entity_value;

    // reads a quoted value; `c` is the opening quote.
    // returns the closing quote on success, '>' if the declaration ended
    // before the quote was closed, or 0 at end of input
    auto parse_entity_value = [&](char_type c) -> char_type {
      char_type starting_c = c; // either \" or \'
      for (c = get_char(); c; c = get_char()) {
        if (c == starting_c)
          return c;
        if (c == '&') {
          // expand nested entity references into the declared value
          char_receiver rcv(entity_value);
          scan_entity(&rcv);
          continue;
        }
        if (c == '>') {
          // unterminated value: put the opening quote back in front so the
          // text reconstructed by the caller contains it
          entity_value.insert(0, starting_c);
          return c;
        }
        entity_value.push(c);
      }
      return c;
    };

    // reads the entity name starting with `c`;
    // returns the first character that does not belong to the name
    auto parse_entity_name = [&](char_type c) -> char_type {
      // attribute name...
      while (is_valid_entity_name_char(c)) {
        entity_name.push(char(c));
        c = get_char();
      }
      return c;
    };

    char_type c = skip_whitespace();

    c = parse_entity_name(c);
    if (entity_name.is_empty()) {
      value.push(WCHARS("<!ENTITY "));
      value.push(c);
      return TT_TEXT;
    }
    c = skip_whitespace();
    if (c != '\'' && c != '\"') {
      // no quoted value follows the name
      value.push(WCHARS("<!ENTITY "));
      value.push(ustring(entity_name()));
      value.push(c);
      return TT_TEXT;
    }
    c = parse_entity_value(c);
    if (c != '\'' && c != '\"') {
    TAIL_ERROR: // also the target of the goto below
      value.push(WCHARS("<!ENTITY "));
      value.push(ustring(entity_name()));
      value.push(' ');
      value.push(entity_value());
      value.push(c);
      return TT_TEXT;
    }
    c = skip_whitespace();
    if (c != '>')
      goto TAIL_ERROR;

    entity_registrator_cb(entity_name(), entity_value());

    return TT_ENTITIY;
  }

  NONCOPYABLE(scanner)
};

template <typename CHAR_TYPE> struct char_traits;

// Narrow streams: every input byte is one character.
template <> struct char_traits<char> {
  static char get_char(bytes &buf) {
    return static_cast<char>(buf++);
  }
};

// Wide streams: the next character is whatever u8::getc() reads from the
// byte buffer, converted to wchar.
template <> struct char_traits<wchar> {
  static wchar get_char(bytes &buf) {
    auto decoded = u8::getc(buf);
    return static_cast<wchar>(decoded);
  }
};

// Input stream over an in-memory byte buffer.
// Characters are produced by char_traits<CHAR_TYPE>::get_char(); 0 is
// returned once the buffer is exhausted. The buffer is referenced, not
// copied, so the memory must outlive the stream.
template <typename CHAR_TYPE> class mem_istream : public instream<CHAR_TYPE> {
  using char_type = CHAR_TYPE;
  using super     = instream<CHAR_TYPE>;

  bytes buf; // not yet consumed part of the input

public:
  mem_istream(bytes text, const string &url) : super(url), buf(text) {}

  mem_istream(chars text, const string &url)
      : super(url),
        buf(bytes(reinterpret_cast<const byte *>(text.start), text.length)) {}

  virtual char_type get_char() {
    if (buf.length != 0)
      return char_traits<char_type>::get_char(buf);
    return 0;
  }
};

class mem_ostream {
  tool::array<byte> buf;

public:
  mem_ostream() {
    // utf8 byte order mark
    static unsigned char BOM[] = {0xEF, 0xBB, 0xBF};
    buf.push(BOM, sizeof(BOM));
  }

  // intended to handle only ascii-7 strings
  // use this for markup output
  mem_ostream &operator<<(const char *str) {
    buf.push((const byte *)str, int(strlen(str)));
    return *this;
  }

  // use UNICODE chars for value output
  mem_ostream &operator<<(const wchar *wstr) {
    const wchar *pc = wstr;
    for (unsigned int c = *pc; c; c = *(++pc)) {
      switch (c) {
      case '<':
        *this << "&lt;";
        continue;
      case '>':
        *this << "&gt;";
        continue;
      case '&':
        *this << "&amp;";
        continue;
      case '"':
        *this << "&quot;";
        continue;
      case '\'':
        *this << "&apos;";
        continue;
      }
      if (c < (1 << 7)) {
        buf.push(byte(c));
      } else if (c < (1 << 11)) {
        buf.push(byte((c >> 6) | 0xc0));
        buf.push(byte((c & 0x3f) | 0x80));
      } else if (c < (1 << 16)) {
        buf.push(byte((c >> 12) | 0xe0));
        buf.push(byte(((c >> 6) & 0x3f) | 0x80));
        buf.push(byte((c & 0x3f) | 0x80));
      } else if (c < (1 << 21)) {
        buf.push(byte((c >> 18) | 0xe0));
        buf.push(byte(((c >> 12) & 0x3f) | 0x80));
        buf.push(byte(((c >> 6) & 0x3f) | 0x80));
        buf.push(byte((c & 0x3f) | 0x80));
      }
    }
    return *this;
  }

  mem_ostream &operator<<(chars str) {
    buf.push((const byte *)str.start, str.length);
    return *this;
  }

  void write(const char *str, size_t str_length) {
    buf.push((const byte *)str, int(str_length));
  }

  tool::array<byte> &data() { return buf; }

  operator const char *() {
    if (buf.size() == 0) {
      buf.push(0);
      return (const char *)buf.head();
    }
    if (buf.last() != 0)
      buf.push(0);
    return (const char *)buf.head();
  }
  size_t size() const { return buf.size(); }

  tool::chars chars() {
    return tool::chars(this->operator const char *(), size());
  }
};

} // namespace markup

} // namespace tool

#endif
