/*
 *
 * cs_url.cpp
 *
 * Copyright (c) 2001, 2002
 * Andrew Fedoniouk - andrew@terra-informatica.org
 * Portions: Serge Kuznetsov -  kuznetsov@deeptown.org
 *
 * See the file "COPYING" for information on usage
 * and redistribution of this file
 *
 */

#include "config.h"
#include "tl_url.h"
#include "tl_wregexp.h"
#include "tool.h"
#include <ctype.h>

#if defined(_WIN32)
  #include <shlwapi.h>
  #pragma comment(lib, "shlwapi.lib")
#else
  #define strnicmp strncasecmp
#endif

namespace tool {
// One row of the scheme -> default port table used by url::parse().
struct protoport {
  chars  proto; // scheme name as it appears before the ':' (compared against the lower-cased protocol)
  int    port;  // default port for the scheme; the "data" row uses -1, which parse() tests for explicitly
};

// Known schemes and their default ports, scanned linearly by url::parse().
// The "data" entry carries -1: parse() checks for that value and then stores
// everything after the colon as the filename without further splitting.
static protoport protoports[] = {
    {CHARS("ftp"), 21},
    {CHARS("gopher"), 70},
    {CHARS("http"), 80},
    {CHARS("https"), 443},
    {CHARS("socks"), 1080},
    {CHARS("svn"), 3690},
    {CHARS("data"), -1},
};

/*
 * ParseURL
 *
 * Turns a URL into a URLParts structure
 *
 * The good stuff was written by Rob May <robert.may@rd.eng.bbc.co.uk>
 * and heavily mangled/modified by john to suit his own weird style.
 * Made somewhat smarter (err, completely re-written) by GN 1997May02
 */
bool url::parse(const char *src) {
  /*
   * Splits `src` into protocol, username, password, hostname, port,
   * filename, params (text after '?') and anchor (text after '#').
   *
   * The input is copied into a private, NUL-terminated buffer which is then
   * cut into pieces in place by overwriting separators with '\0'.
   * Every exit path of this function returns true.
   */

  clear(); // presumably resets all url fields -- defined outside this file, TODO confirm

  // keep a copy of the original text with back slashes turned into forward slashes
  this->src = src;
  this->src.replace_all('\\', '/');

  const char *s, *t;
  char *      fragmark; /* '#' fragment marker if any (also reused for the '?' marker) */
  /* NB Fragments  (which the chimera source calls 'anchors' are part
   * of HTML href's but _not_ properly speaking of URLs;  they are handled
   * entirely at the client end and not by the server.
   * Nevertheless we look for them  (this routine should really be called
   * ParseHREF)  and store a fragment identifier separately if we find one.
   * --GN
   */

  array<char> buffer;

  // t = start = buffer;
  /* RFC1738 says spaces in URLs are to be ignored -- GN 1997May02
     not anymore -- ANDREW FEDONIOUK
  */
  /*
  for ( s = src; *s; s++ )
    //if ( !isspace ( *s ) )
    if (  *s == '\\' )
      buffer.push ( '/' );
    else
      buffer.push ( *s );
  buffer.push ( '\0' );
  */

  // working copy of the normalized text plus a terminating NUL
  buffer.push(this->src());
  buffer.push(0);

  char *start = buffer.begin();

  /* Lousy hack for URNs */
  // everything after "urn:" becomes the filename; no other field is filled in
  if (strncmp(start, "urn:", 4) == 0) {
    protocol = "urn";
    filename = &buffer[4];
    return true;
  }
  /* Less lousy hack for URLs which say so */
  if (strncmp(start, "url:", 4) == 0)
    s = start + 4;
  else
    s = start;

  /*
   * Check to see if there is a protocol (scheme) name.
   * Matches /^[A-Za-z0-9\+\-\.]+:/ in PERLese.
   */
  // NOTE(review): isalnum() is called with a plain char here; bytes >= 0x80
  // may be negative on platforms where char is signed -- confirm this is safe.
  for (t = s; *t; t++) {
    if (!isalnum(*t) && *t != '-' && *t != '+' && *t != '.')
      break;
  }
  if (*t == ':') {
    protocol = string(s, int(t - s));
    if (protocol.length() == 1) {
      // windows file names!
      // a single-letter "scheme" is taken to be a drive letter; s has not been
      // advanced, so the whole text (drive letter included) becomes the filename at L1
      protocol = "file";
      goto L1;
    }
    protocol.to_lower();

    // known scheme: remember its default port in both dport and port
    for (uint i = 0; i < sizeof(protoports) / sizeof(protoport); i++)
      if (protocol == protoports[i].proto) {
        dport = port = protoports[i].port;
        break;
      }
    s = ++t; // step over the ':'
    if (port == -1) // -1 is the table value of the "data" scheme
      goto data_scheme;
  }
  /*
   * Check whether this is an 'Internet' URL i.e. the next bit begins
   * with "//".  In this case, what follows up to the next slash ought
   * to parse as "//user:passwd@host.dom.ain:port/" with almost every
   * component optional, and we'll continue later with s pointing at the
   * trailing slash.  If there is no further slash, we'll add one and
   * return.-- None of the fields are supposed to contain any visible
   * (unencoded)  colons, slashes or atsigns.
   */
  if (s[0] == '/' && s[1] == '/') /* looking at "//" */
  {
    char *atsign; /* if present, user:passwd precedes it */
    char *colon;  /* colon separators after user or host */
    char *tslash; /* trailing slash */

    s += 2;

    protocol_path = true;

    if (protocol == CHARS("file")) {
      // dances around //, ///, //// and /////
      // count the slashes that follow the initial "//"
      int numslashes = 0;
      for (const char *si = s; *si; ++si) {
        if (*si == '/')
          ++numslashes;
        else
          break;
      }
      switch (numslashes) {
      case 0:
#ifdef WINDOWS
        if (!chars_of(s).like("?:/*"))
          s -= 2;  // treat it as file://windowsnetshare/path
#endif // WINDOWS
        break; // ok - file://path
      case 1:
#ifndef POSIX
        --s;
#endif
        break; // historical problem - file:///path and file://path
      case 2:
        break; // win path - file:////win-net-path -> //win-net-path
      case 3:
        ++s;
        break; // win path - file://///win-net-path -> //win-net-path
      default:
        break; // ?
      }
      // file URLs skip the authority and '?' handling below; only '#' is split off at L1
      goto L1;
    } else {
      tslash = const_cast<char *>(strchr(s, '/'));
      if (tslash != nullptr)
        *tslash = '\0'; /* split the string, we'll undo this later */
    }

    atsign = const_cast<char *>(strchr(s, '@'));

    if (atsign != nullptr) /* a username is present, possibly empty */
    {
      *atsign = '\0'; /* split the string again */
      colon   = const_cast<char *>(strchr(s, ':'));

      if (colon != nullptr) /* a passwd is also present */
      {
        *colon   = '\0';
        password = colon + 1;
        password = unescape(password);
      }
      username = s;
      username = unescape(username);
      s        = atsign + 1;
    }

    if (*s == '[') // ipv6 , like http://[FEDC:BA98:7654:3210:FEDC:BA98:7654:3210]:80/index.html
    {
      ++s;
      char *closing = const_cast<char *>(strchr(s, ']'));
      if (closing != nullptr) {
        *closing = '\0';
        hostname = s; // text between the brackets
        s = closing + 1;
      }
    } 
      
    colon = const_cast<char *>(strchr(s, ':'));
    if (colon != nullptr) /* a port is specified */
    {
       *colon = '\0';
       port = atoi(colon + 1);
    }

    // in the bracketed case s is now empty, so the hostname stored above is kept
    if(*s)
      hostname = s;

    if (tslash == nullptr) /* nothing further */
    {
      if (protocol == CHARS("http") || protocol == CHARS("https") || protocol == CHARS("ftp"))
        filename = "/";
      // else if(protocol == "res")
      //{
      //  filename = hostname;
      //  hostname.clear();
      //}

      goto fillport;
    }
    *tslash = '/';    /* restore the slash */
    s       = tslash; /* and stay there, don't step beyond */
  }

  // request (GET) params
  // everything after the first '?' goes to params; a '#' inside that text is
  // therefore not seen by the anchor search below
  fragmark = const_cast<char *>(strchr(s, '?'));
  if (fragmark != nullptr) {
    *fragmark = '\0';
    params    = fragmark + 1;
  }
L1:
  // end of special treatment of Internet URLs.
  // s points at filename part  (if any).
  fragmark = const_cast<char *>(strchr(s, '#'));
  if (fragmark != nullptr) {
    *fragmark = '\0';
    anchor    = fragmark + 1;
  }
  filename = s; /* everything else goes here */
fillport:
  hostname.to_lower();
  // no port assigned so far: fall back to the table default for the protocol (if listed)
  if (port == 0) {
    for (uint i = 0; i < sizeof(protoports) / sizeof(protoport); i++)
      if (protoports[i].proto == protocol) {
        port = protoports[i].port;
        break;
      }
  }
  return true;
data_scheme:
  // "data:" URL - the whole payload after the colon is kept verbatim as the filename
  filename = s;
  return true;
}

/*
 * escape URL
 *
 * Puts escape codes in URLs.  (More complete than it used to be;
 * GN Jan 1997.  We escape all that isn't alphanumeric, "safe" or "extra"
 * as spec'd in RFCs 1738, 1808 and 2068.)
 */
// Tells whether code unit `c` may appear unescaped in a URL: an ASCII
// character for which is_alnum() holds, or one of  / : $ - _ . ! * ' ; , ? & = @ # % [ ]
// Returns false for NUL and for anything outside the ASCII range.
bool is_url_char(unsigned int c) {
  // NUL has to be rejected up front: strchr() treats the terminating '\0'
  // as part of the searched string, so strchr(set, 0) is never null.
  if (c == 0 || c >= 128)
    return false;
  if (is_alnum(ucode(c)))
    return true;
  return strchr("/:$-_.!*';,?&=@#%[]", int(c)) != nullptr;
}

// Tells whether code unit `c` can be emitted as is by escape_param(): an
// ASCII character for which is_alnum() holds, or one of  - . _ ~
// Returns false for NUL and for anything outside the ASCII range.
bool is_safe_url_char(unsigned int c) {
  // NUL has to be rejected up front: strchr() treats the terminating '\0'
  // as part of the searched string, so strchr(set, 0) is never null.
  if (c == 0 || c >= 128)
    return false;
  if (is_alnum(ucode(c)))
    return true;
  return strchr("-._~", int(c)) != nullptr;
}

// Tells whether code unit `c` can be emitted as is by escape_file_path(): an
// ASCII character for which is_alnum() holds, or one of  / : - . _ ~ ( ) [ ] ! ,
// Returns false for NUL and for anything outside the ASCII range.
bool is_safe_file_url_char(unsigned int c) {
  // NUL has to be rejected up front: strchr() treats the terminating '\0'
  // as part of the searched string, so strchr(set, 0) is never null.
  if (c == 0 || c >= 128)
    return false;
  if (is_alnum(ucode(c)))
    return true;
  return strchr("/:-._~()[]!,", int(c)) != nullptr;
}


// Heuristic: the text is considered already percent-encoded when it contains
// at least one '%' and every other character passes is_url_char().
bool url::looks_like_encoded(const string &s) {
  bool saw_percent = false;
  for (int pos = 0; pos < s.size(); ++pos) {
    const char ch = s[pos];
    if (ch == '%') {
      saw_percent = true;
      continue;
    }
    // a single non-URL character settles the answer
    if (!is_url_char(ch))
      return false;
  }
  return saw_percent;
}

bool url::need_escapement(const ustring &s) {
  if (s().starts_with(WCHARS("data:")))
    return false;
  for (int n = 0; n < s.size(); ++n) {
    wchar c = s[n];
    if (!is_url_char(c))
      return true;
  }
  return false;
}

// Percent-encodes every byte of `src` that fails is_url_char(); '+' is kept.
// With norm_slash set, back slashes are written as forward slashes.
// The space_to_plus argument is accepted but forced off (see note below),
// so spaces always come out as "%20".
string url::escape(chars src, bool space_to_plus, bool norm_slash) {
  static const char *digits = "0123456789ABCDEF";

  space_to_plus = false; // NOTE: space_to_plus=true causes troubles if path itself contains '+'

  array<char> out;

  while (!!src) {
    const char ch = *src;
    ++src;

    if (space_to_plus && ch == ' ') {
      out.push('+');
      continue;
    }
    if (norm_slash && ch == '\\') {
      out.push('/');
      continue;
    }

    const bool keep_plus = !space_to_plus && ch == '+';
    if (is_url_char((unsigned char)ch) || keep_plus) {
      out.push(ch);
      continue;
    }

    // %XX with upper-case hex digits
    const unsigned char octet = (unsigned char)ch;
    out.push('%');
    out.push(digits[octet / 16]);
    out.push(digits[octet % 16]);
  }

  return out();
}

// Percent-encodes every byte of `src` that fails is_safe_url_char()
// (i.e. anything but ASCII alphanumerics and - . _ ~).
string url::escape_param(chars src) {
  static const char *digits = "0123456789ABCDEF";

  array<char> out;

  const char *p = src.start;
  while (p < src.end()) {
    const char ch = *p++;
    if (is_safe_url_char(ch)) {
      out.push(ch);
      continue;
    }
    const unsigned char octet = (unsigned char)ch;
    out.push('%');
    out.push(digits[octet / 16]);
    out.push(digits[octet % 16]);
  }
  return out();
}

// Wide-character overload: converts to UTF-8 first, then escapes the bytes.
string url::escape_param(wchars src) {
  string utf8 = u8::cvt(src);
  return escape_param(utf8());
}

// Percent-encodes every byte of `src` that fails is_safe_file_url_char();
// path punctuation such as '/' and ':' is therefore left untouched.
string url::escape_file_path(chars src) {
  static const char *digits = "0123456789ABCDEF";

  array<char> out;

  const char *p = src.start;
  while (p < src.end()) {
    const char ch = *p++;
    if (is_safe_file_url_char(ch)) {
      out.push(ch);
      continue;
    }
    const unsigned char octet = (unsigned char)ch;
    out.push('%');
    out.push(digits[octet / 16]);
    out.push(digits[octet % 16]);
  }
  return out();
}

// Wide-character overload: converts to UTF-8 first, then escapes the bytes.
string url::escape_file_path(wchars src) {
  string utf8 = u8::cvt(src);
  return escape_file_path(utf8());
}



/* When a new URI scheme defines a component that represents textual
   data consisting of characters from the Universal Character Set [UCS],
   the data should first be encoded as octets according to the UTF-8
   character encoding [STD63]; then only those octets that do not
   correspond to characters in the unreserved set should be percent-
   encoded.  For example, the character A would be represented as "A",
   the character LATIN CAPITAL LETTER A WITH GRAVE would be represented
   as "%C3%80", and the character KATAKANA LETTER A would be represented
   as "%E3%82%A2". */

// Wide-character overload: the text is converted to UTF-8 and the resulting
// octets are handed to the narrow escape() with the same flags.
string url::escape(wchars src, bool space_to_plus, bool norm_slash) {
  return escape(u8::cvt(src)(), space_to_plus, norm_slash);
}

// Turns a file system path into URL text.
//  - text already starting with "file://" is returned unchanged;
//  - a relative path is only escaped;
//  - an absolute path is escaped and prefixed with "file://".
// On WINDOWS back slashes are replaced by forward slashes after the
// relative/absolute test has been made on the untouched path.
string url::path_to_file_url(ustring path) {
  if (path.like(W("file://*")))
    return path;

#ifdef WINDOWS
  const bool is_relative = PathIsRelative(path.c_str()) ? true : false;
  path.replace_all('\\', '/');
#else
  const bool is_relative = !path().starts_with('/');
#endif

  string escaped = escape_file_path(path());
  if (is_relative)
    return escaped;
  return CHARS("file://") + escaped;
}

// Maps "file://..." and "home://..." URLs to a path via real_path();
// any other text is handed back as is.
ustring url::file_url_to_path(string u) {
  const int prefix_len = 7; // length of both "file://" and "home://"

  if (u.like("file://*")) {
    ustring decoded = unescape(u(prefix_len));
    return real_path(decoded);
  }

  if (!u.like("home://*"))
    return u;

  ustring rp = url::unescape(u(prefix_len)); // home://
  return real_path(tool::get_home_dir(rp));
}

// Wide-string variant: maps "file://..." and "home://..." URLs to a path via
// real_path(); any other text is handed back as is.
ustring url::file_url_to_path(ustring u) {
  const int prefix_len = 7; // length of both "file://" and "home://"

  if (u.like(W("file://*"))) {
    ustring decoded = unescape(u(prefix_len));
    return real_path(decoded);
  }

  if (!u.like(W("home://*")))
    return u;

  ustring rp = url::unescape(u(prefix_len)); // home://
  return real_path(tool::get_home_dir(rp));
}


/*
 * UnescapeURL
 *
 * Converts the escape codes (%xx) into actual characters.
 * Does utf8 restoration.
 */
// Decodes %XX escapes in `src` into raw bytes and converts the resulting
// byte sequence with u8::cvt().
//
//  - "%%" yields a single '%';
//  - "%XY" where X or Y is not a hex digit is copied through literally;
//  - an incomplete escape at the end of the input ("%" or "%X") is copied
//    through literally and decoding stops there.
//
// The look-ahead never reads at or past src.end(). An embedded NUL right
// after '%' (or after "%X") is treated like the end of the input.
ustring url::unescape(chars src) {
  const char *const end = src.end();
  char              hex[3];

  array<byte> buffer;

  for (const char *cp = src.start; cp < end; cp++) {
    if (*cp != '%') {
      buffer.push(*cp);
      continue;
    }

    const char *h = cp + 1; // first character after '%'
    if (h >= end || *h == 0) {
      // lone trailing '%'
      buffer.push('%');
      break;
    }
    if (*h == '%') {
      // "%%" -> '%'
      buffer.push(*h);
      cp = h;
      continue;
    }
    if (h + 1 >= end || h[1] == 0) {
      // truncated "%X": emit it once and stop
      buffer.push('%');
      buffer.push(*h);
      break;
    }

    hex[0] = h[0];
    hex[1] = h[1];
    hex[2] = '\0';
    if (is_xdigit(hex[0]) && is_xdigit(hex[1]))
      buffer.push((char)strtol(hex, nullptr, 16));
    else {
      // not a valid escape - keep the three characters as they are
      buffer.push('%');
      buffer.push((const byte *)hex, 2);
    }
    cp = h + 1; // both digits consumed; the loop increment steps past them
  }
  return u8::cvt(buffer());
}

// True only for the forward slash; the back slash is not a delimiter here.
inline bool is_path_delim(int c) {
  const bool is_forward_slash = (c == '/');
  return is_forward_slash;
}

// Returns the length of the case-insensitive common prefix of p1 and p2.
// When the match stops in the middle of a path component, the result is
// moved back to the preceding '/' or '\' found in p1 (or to 0 if none).
index_t common_path(const string &p1, const string &p2) {
  const index_t len1 = p1.size();
  const index_t len2 = p2.size();

  index_t pos = 0;
  for (; pos < len1 && pos < len2; ++pos) {
    if (toupper(p1[pos]) != toupper(p2[pos]))
      break;
  }

  // the three ways of stopping inside a component
  const bool both_continue = pos < len1 && pos < len2;
  const bool p2_exhausted  = pos < len1 && pos == len2 && !is_path_delim(p1[pos]);
  const bool p1_exhausted  = pos < len2 && pos == len1 && !is_path_delim(p2[pos]);

  if (both_continue || p2_exhausted || p1_exhausted) {
    if (pos != 0)
      --pos; // here was the last match
    while (pos != 0 && p1[pos] != '/' && p1[pos] != '\\')
      --pos;
  }
  return pos;
}

// make relative path out of two absolute paths
string relpath(const string &abspath, const string &basepath)
// Expresses abspath relative to basepath.
//  - nothing in common (common_path() == 0): abspath is returned unchanged;
//  - otherwise one "../" is emitted for every '/' found in basepath after the
//    common part (a '/' that is the very last character is not counted),
//    followed by the remainder of abspath;
//  - a trailing '/' is dropped and an empty result is replaced by ".".
{
  index_t shared = common_path(abspath, basepath);
  if (shared == 0)
    return abspath; // actually no match - cannot make it relative

  const index_t base_len = basepath.size();
  const index_t abs_len  = abspath.size();

  string rel;

  // climb out of the directories basepath has below the common part
  for (index_t k = shared + 1; k < base_len; ++k) {
    if (basepath[k] != '/')
      continue;
    if (k + 1 == base_len)
      break;
    rel += "../";
  }

  // then descend along what is left of abspath
  if (abspath[shared] == '/')
    ++shared;
  while (shared < abs_len) {
    rel += abspath[shared];
    ++shared;
  }

  if (rel.size() && (rel[rel.size() - 1] == '/'))
    rel.length(rel.length() - 1);
  if (rel.length() == 0)
    rel = '.';
  return rel;
}

// Builds a reference to `href` that is relative to this url.
// If protocol, hostname or port differ, href.src is returned as is.
// Otherwise the path part is made relative via common_path() and the params
// and anchor of href are appended.
string url::relative(const url &href) const {
  if (href.protocol != protocol || href.hostname != hostname || href.port != port)
    return href.src;

  const string &target     = href.filename;
  const index_t target_len = target.size();
  const index_t own_len    = filename.size();

  string out;

  index_t shared = common_path(target, filename);

  if (shared == 0) {
    // root-rel
    if (target.length()) {
      if (target[0] == '/') {
        // ATTN! Bug fix temporary here.
        if (target != CHARS("/"))
          out = target;
      } else {
        if (href.hostname.length())
          out = "/";
        out += target;
      }
    }
  } else if (target.length() != filename.length() || (filename.size() != shared)) {
    // one "../" for every '/' this url has below the common part
    // (a '/' that is the very last character is not counted)
    for (index_t k = shared + 1; k < own_len; ++k) {
      if (filename[k] != '/')
        continue;
      if (k + 1 == own_len)
        break;
      out += "../";
    }

    // followed by what is left of the target path
    if (target[shared] == '/')
      ++shared;
    while (shared < target_len) {
      out += target[shared];
      ++shared;
    }

    if (out.size() && (out[out.size() - 1] == '/'))
      out.length(out.length() - 1);
    if (out.length() == 0)
      out = '.';
  }

  if (href.params.length()) {
    out += "?";
    out += href.params;
  }
  if (href.anchor.length()) {
    out += "#";
    out += href.anchor;
  }
  return out;
}

// Directory part of filename, including the trailing '/'.
// Yields an empty string when there is no '/' or when the only '/' is the
// first character.
// NOTE(review): "/name" therefore gives "" rather than "/" - confirm intended.
string url::dir() const {
  if (filename.is_empty())
    return filename;

  const index_t slash_at = filename().last_index_of('/');
  if (slash_at > 0)
    return filename().sub(0, slash_at + 1);
  return string();
}

// Last path component of filename (name plus extension); the whole filename
// when it holds no '/'.
string url::name_ext() const {
  if (filename.is_empty())
    return filename;

  const index_t slash_at = filename().last_index_of('/');
  if (slash_at >= 0)
    return filename().sub(slash_at + 1);
  return filename;
}

// name_ext() without its last ".extension"; unchanged when there is no '.'.
string url::name() const {
  string leaf = name_ext();
  if (leaf.is_empty())
    return leaf;

  const index_t dot_at = leaf().last_index_of('.');
  if (dot_at >= 0)
    return leaf().sub(0, dot_at);
  return leaf;
}

// Text after the last '.' of name_ext(); empty when there is no '.'.
string url::ext() const {
  string leaf = name_ext();
  if (leaf.is_empty())
    return leaf;

  const index_t dot_at = leaf().last_index_of('.');
  if (dot_at >= 0)
    return leaf().sub(dot_at + 1);
  return string();
}

void url::normalize_path() {
  if (filename.is_empty())
    return;

  bool initialslash = filename[0] == '/';
  bool lastslash = filename.size() > 1 && filename[filename.size() - 1] == '/';

  array<string> path;
  chars         comp;

  tokens<char> tz(filename(), CHARS("/"));

  while (tz.next(comp)) {
    if (!comp || comp == CHARS("."))
      continue;
    if (comp != CHARS(".."))
      path.push(comp);
    else if (path.size())
      path.pop();
  }

  filename.clear();
  if (initialslash)
    filename += '/';
  if (path.size()) {
    for (int i = 0; i < path.size() - 1; ++i) {
      filename += path[i];
      filename += '/';
    }
    filename += path.last();
  }
  if (filename.length() && lastslash)
    filename += '/';
}

// Resolves this (relative) url against the base url `abs`.
// Does nothing when this url is already absolute or names a protocol that
// differs from the base one. Protocol and protocol_path are always taken from
// the base; host, ports and directory only when the base is absolute and has
// protocol_path set.
void url::absolute(const url &abs) {
  if (is_absolute())
    return; // nothing to do

  const bool foreign_protocol = protocol.length() && (protocol != abs.protocol);
  if (foreign_protocol)
    return; // nothing to do

  protocol      = abs.protocol;
  protocol_path = abs.protocol_path;

  // seems like "flat" url schema used as a base:
  // we update only protocol part keeping rest intact
  const bool hierarchical_base = abs.is_absolute() && abs.protocol_path;
  if (!hierarchical_base)
    return;

  hostname = abs.hostname;
  port     = abs.port;
  dport    = abs.dport;

  if (filename.length() == 0) {
    filename = abs.dir();
  } else if (filename[0] != '/') {
    filename = abs.dir() + filename;
  }
  normalize_path();
}

// Resolves relpath against abspath (both parsed as urls) and returns the
// composed result.
string abspath(const string &abspath, const string &relpath) {
  url resolved(relpath);
  url base(abspath);
  resolved.absolute(base);
  return resolved.compose();
}

// Reassembles the url text from its parts.
//   only_resource_name - leave out protocol / host / port, start at filename
//   no_anchor          - leave out the "#anchor" part
// Username and password are not written.
string url::compose(bool only_resource_name, bool no_anchor) const {
  string text;

  if (!only_resource_name) {
    if (is_absolute()) {
      if (!protocol.is_empty()) {
        text += protocol;
        text += ':';
      }
      if (is_external() || protocol_path) // internet url
        text += "//";

      text += hostname;

      // the port is written only when a default is known and it differs
      const bool custom_port = dport && (dport != port);
      if (custom_port)
        text += string::format(":%d", port);

      if (!filename.like("/*") && hostname.length())
        text += '/';
    } else if (is_local()) {
      text += "file://";
    } else if (!protocol.is_empty()) {
      text += protocol;
      text += ':';
    }
  }

  text += filename;

  if (params.length()) {
    text += "?";
    text += params;
  }

  if (anchor.length() && !no_anchor) {
    text += "#";
    text += anchor;
  }
  // what about username/password, eh?
  return text;
}

// Composes the part of the url that precedes the filename:
// "protocol://host[:port]" for external urls, "file://" for local ones,
// otherwise "protocol:" when a protocol is set.
string url::compose_host() const {
  string prefix;

  if (is_external()) {
    if (!protocol.is_empty()) {
      prefix += protocol;
      prefix += ':';
    }
    if (is_external() || protocol_path) // internet url
      prefix += "//";

    prefix += hostname;

    // the port is written only when a default is known and it differs
    const bool custom_port = dport && (dport != port);
    if (custom_port)
      prefix += string::format(":%d", port);
    return prefix;
  }

  if (is_local()) {
    prefix += "file://";
    return prefix;
  }

  if (!protocol.is_empty()) {
    prefix += protocol;
    prefix += ':';
  }
  return prefix;
}

// Composes "filename[?params]"; the anchor is not included.
string url::compose_object() const {
  string object;
  object += filename;
  if (!params.length())
    return object;
  object += "?";
  object += params;
  return object;
}

#if 0

// Filter TCP/IP addresses
#define RE_TCP_IP_ADDR_NAME WTEXT("[_a-zA-Z0-9\\-]+([\\.]+[_a-zA-Z0-9\\-]+)*")
#define RE_TCP_IP_ADDR_IP WTEXT("\\d+\\.\\d+\\.\\d+\\.\\d+")
#define RE_TCP_IP_ADDR WTEXT("(") RE_TCP_IP_ADDR_IP WTEXT("|") RE_TCP_IP_ADDR_NAME WTEXT(")")
// Filter an e-mail address
#define RE_EMAIL_ADDR WTEXT("[_a-zA-Z0-9\\-\\.]+@(") RE_TCP_IP_ADDR WTEXT(")")
// Filter an unix path
#define RE_UNIX_PATH WTEXT("(/[_a-zA-Z0-9\\.\\-]*)+")

#define RE_PARAMS WTEXT("(\\?[_a-zA-Z0-9\\&\\=\\%\\,\\-\\!\\(\\)\\{\\}]+)?")
#define RE_ANCHOR WTEXT("(\\#[_a-zA-Z0-9\\%]+)?")
// filters an URL - ftp or http.

#define RE_URL                                                                 \
  WTEXT("(ftp|https?)://(")                                                    \
  RE_TCP_IP_ADDR WTEXT(")(:[0-9]+)?(") RE_UNIX_PATH WTEXT(")*")                \
      RE_PARAMS RE_ANCHOR WTEXT("$")
#define RE_WWW WTEXT("www\\.")
#define RE_FTP WTEXT("ftp\\.")

// (disabled by the surrounding #if 0)
// Returns the shared matcher for a full ftp/http/https URL ("^" + RE_URL);
// the function-local static is constructed on the first call.
tool::wregexp& re_canonic_url() {
  static tool::wregexp re((const wchar *) WTEXT("^") RE_URL);
  return re;
}
// (disabled by the surrounding #if 0)
// Returns the shared matcher for an e-mail address with an optional
// "mailto:" prefix; the prefix is capture group 1.
tool::wregexp& re_email() {
  static tool::wregexp re((const wchar *) WTEXT("^(mailto:)?(") RE_EMAIL_ADDR WTEXT(")") );
  return re;
}
// (disabled by the surrounding #if 0)
// Returns the shared matcher for scheme-less text starting with "www."
// followed by a host name and optional path, params and anchor.
tool::wregexp& re_www() {
  static tool::wregexp re((const wchar *)   WTEXT("^") RE_WWW RE_TCP_IP_ADDR_NAME WTEXT("(") RE_UNIX_PATH WTEXT(")*") RE_PARAMS RE_ANCHOR );
  return re;
}
// (disabled by the surrounding #if 0)
// Returns the shared matcher for scheme-less text starting with "ftp."
// followed by a host name and optional path.
tool::wregexp& re_ftp() {
  static tool::wregexp re((const wchar *)   WTEXT("^") RE_FTP RE_TCP_IP_ADDR_NAME WTEXT("(") RE_UNIX_PATH WTEXT(")*") );
  return re;
}

// (disabled by the surrounding #if 0)
// Returns the shared matcher for a bare host name (no scheme, no "www."
// requirement) followed by optional path, params and anchor.
tool::wregexp& re_no_www() {
  static tool::wregexp re((const wchar *)  WTEXT("^") RE_TCP_IP_ADDR_NAME WTEXT("(") RE_UNIX_PATH WTEXT(")*") RE_PARAMS RE_ANCHOR );
  return re;
}

// (disabled by the surrounding #if 0)
// A character can be part of a hyperlink exactly when is_url_char() accepts it.
bool is_hyperlink_char(wchar uc) { return is_url_char(uc); }
/*{
  if(uc > 127)
    return false;
  if(strchr("/:$-_.!*'(),?&=@#",uc)) //  ".:/-_@?=%&#"
    return true;
  if(isalnum(uc))
    return true;
  return false;
}*/

// (disabled by the surrounding #if 0)
// Tests whether `text` looks like a hyperlink and, if so, stores a usable
// URL in `out`:
//   - full ftp/http/https URL        -> stored unchanged
//   - e-mail address                 -> "mailto:" is prepended unless the
//                                       text already carries that prefix
//   - text starting with "www."      -> "http://" is prepended
//   - text starting with "ftp."      -> "ftp://" is prepended
//   - bare host (only if and_no_www) -> "http://" is prepended
// Returns false, leaving `out` untouched, when nothing matches.
bool is_hyperlink(const tool::ustring& text, tool::ustring& out, bool and_no_www)
  {
    if(re_canonic_url().exec(text))
    {
      out = text;
      return true;
    }

    if(re_email().exec(text))
    {
      // NOTE(review): icmp() is presumably a case-insensitive equality test
      // that is true on a match - confirm against its declaration
      if( icmp(re_email().get_match(1), WCHARS("mailto:")) )
      {
        out = text;
        return true;
      }
      out = W("mailto:") + text;
      return true;
    }

    if(re_www().exec(text))
    {
      out = W("http://") + text;
      return true;
    }

    if(re_ftp().exec(text))
    {
      out = W("ftp://") + text;
      return true;
    }
    if(and_no_www && re_no_www().exec(text))
    {
      out = W("http://") + text;
      return true;
    }
    return false;
  }

  // (disabled by the surrounding #if 0)
  // In-place variant: tests a copy of `text` with the three-argument
  // overload (bare host names are not accepted) and lets it write the
  // resulting URL back into `text`.
  bool is_hyperlink(tool::ustring& text)
  // might modify the text if successfully tested
  {
    tool::ustring in = text;
    return is_hyperlink(in,text,false);
  }

#endif

} // namespace tool
