code/mutt_2charset_8c_source.html

#include "config.h"

#include <errno.h>

#include <iconv.h>

#include <langinfo.h>

#include <limits.h>

#include <stdbool.h>

#include <stdio.h>

#include <string.h>

#include "charset.h"

#include "buffer.h"

#include "list.h"

#include "logging2.h"

#include "memory.h"

#include "pool.h"

#include "queue.h"

#include "regex3.h"

#include "slist.h"

#include "string2.h"

#ifdef ENABLE_NLS

#include <libintl.h>

#endif


/* Fallback for platforms whose headers don't define EILSEQ: use EINVAL instead */
#ifndef EILSEQ

#define EILSEQ EINVAL

#endif


/**
 * ReplacementChar - When a Unicode character can't be displayed, use this instead
 *
 * Starts as '?'; mutt_ch_set_charset() changes it to 0xfffd when the charset is utf-8.
 */
wchar_t ReplacementChar = '?';


/**
 * CharsetIsUtf8 - Is the user's current character set utf-8?
 *
 * Updated by mutt_ch_set_charset().
 */
bool CharsetIsUtf8 = false;


/**
 * struct Lookup - Regex to String lookup table
 *
 * One entry maps a charset-name pattern to a replacement charset name.
 */
struct Lookup

{

  enum LookupType type; ///< Lookup type

  struct Regex regex; ///< Regular expression (embedded, not separately allocated)

  char *replacement; ///< Alternative charset to use

  TAILQ_ENTRY(Lookup) entries; ///< Linkage within a LookupList

};

/* Declares `struct LookupList`, the head type of a tail queue of Lookups */
TAILQ_HEAD(LookupList, Lookup);


/**
 * Lookups - Lookup table of preferred character set names
 *
 * Filled by mutt_ch_lookup_add(), emptied by mutt_ch_lookup_remove().
 */
static struct LookupList Lookups = TAILQ_HEAD_INITIALIZER(Lookups);


/**
 * struct IconvCacheEntry - Cached iconv conversion descriptor
 *
 * The two names are the canonicalised charset names used as the cache key.
 */
struct IconvCacheEntry

{

  char *fromcode1; ///< Source character set

  char *tocode1; ///< Destination character set

  iconv_t cd; ///< iconv conversion descriptor (may be invalid: failed opens are cached too)

};


/// Max size of the iconv cache
#define ICONV_CACHE_SIZE 16

/// Cache of iconv conversion descriptors; index 0 is the most recently used
static struct IconvCacheEntry IconvCache[ICONV_CACHE_SIZE];

/// Number of iconv descriptors in the cache
static int IconvCacheUsed = 0;


/**
 * struct MimeNames - MIME name lookup entry
 */
struct MimeNames
{
  const char *key;  ///< Alias as it may be encountered
  const char *pref; ///< Preferred MIME name for that alias
};

/**
 * PreferredMimeNames - Lookup table of preferred charsets
 *
 * Maps charset aliases onto the name that should be used instead.
 * The table is terminated by an entry whose members are both NULL.
 */
static const struct MimeNames PreferredMimeNames[] = {
  // clang-format off
  /* US-ASCII */
  { "ansi_x3.4-1968", "us-ascii" },
  { "iso-ir-6", "us-ascii" },
  { "iso_646.irv:1991", "us-ascii" },
  { "ascii", "us-ascii" },
  { "iso646-us", "us-ascii" },
  { "us", "us-ascii" },
  { "ibm367", "us-ascii" },
  { "cp367", "us-ascii" },
  { "csASCII", "us-ascii" },

  /* ISO-2022 and EUC-KR */
  { "csISO2022KR", "iso-2022-kr" },
  { "csEUCKR", "euc-kr" },
  { "csISO2022JP", "iso-2022-jp" },
  { "csISO2022JP2", "iso-2022-jp-2" },

  /* ISO-8859-1 */
  { "ISO_8859-1:1987", "iso-8859-1" },
  { "iso-ir-100", "iso-8859-1" },
  { "iso_8859-1", "iso-8859-1" },
  { "latin1", "iso-8859-1" },
  { "l1", "iso-8859-1" },
  { "IBM819", "iso-8859-1" },
  { "CP819", "iso-8859-1" },
  { "csISOLatin1", "iso-8859-1" },

  /* ISO-8859-2 */
  { "ISO_8859-2:1987", "iso-8859-2" },
  { "iso-ir-101", "iso-8859-2" },
  { "iso_8859-2", "iso-8859-2" },
  { "latin2", "iso-8859-2" },
  { "l2", "iso-8859-2" },
  { "csISOLatin2", "iso-8859-2" },

  /* ISO-8859-3 */
  { "ISO_8859-3:1988", "iso-8859-3" },
  { "iso-ir-109", "iso-8859-3" },
  { "ISO_8859-3", "iso-8859-3" },
  { "latin3", "iso-8859-3" },
  { "l3", "iso-8859-3" },
  { "csISOLatin3", "iso-8859-3" },

  /* ISO-8859-4 */
  { "ISO_8859-4:1988", "iso-8859-4" },
  { "iso-ir-110", "iso-8859-4" },
  { "ISO_8859-4", "iso-8859-4" },
  { "latin4", "iso-8859-4" },
  { "l4", "iso-8859-4" },
  { "csISOLatin4", "iso-8859-4" },

  /* ISO-8859-6 */
  { "ISO_8859-6:1987", "iso-8859-6" },
  { "iso-ir-127", "iso-8859-6" },
  { "iso_8859-6", "iso-8859-6" },
  { "ECMA-114", "iso-8859-6" },
  { "ASMO-708", "iso-8859-6" },
  { "arabic", "iso-8859-6" },
  { "csISOLatinArabic", "iso-8859-6" },

  /* ISO-8859-7 */
  { "ISO_8859-7:1987", "iso-8859-7" },
  { "iso-ir-126", "iso-8859-7" },
  { "ISO_8859-7", "iso-8859-7" },
  { "ELOT_928", "iso-8859-7" },
  { "ECMA-118", "iso-8859-7" },
  { "greek", "iso-8859-7" },
  { "greek8", "iso-8859-7" },
  { "csISOLatinGreek", "iso-8859-7" },

  /* ISO-8859-8 */
  { "ISO_8859-8:1988", "iso-8859-8" },
  { "iso-ir-138", "iso-8859-8" },
  { "ISO_8859-8", "iso-8859-8" },
  { "hebrew", "iso-8859-8" },
  { "csISOLatinHebrew", "iso-8859-8" },

  /* ISO-8859-5 */
  { "ISO_8859-5:1988", "iso-8859-5" },
  { "iso-ir-144", "iso-8859-5" },
  { "ISO_8859-5", "iso-8859-5" },
  { "cyrillic", "iso-8859-5" },
  { "csISOLatinCyrillic", "iso-8859-5" },

  /* ISO-8859-9 */
  { "ISO_8859-9:1989", "iso-8859-9" },
  { "iso-ir-148", "iso-8859-9" },
  { "ISO_8859-9", "iso-8859-9" },
  { "latin5", "iso-8859-9" }, /* this is not a bug */
  { "l5", "iso-8859-9" },
  { "csISOLatin5", "iso-8859-9" },

  /* ISO-8859-10 */
  { "ISO_8859-10:1992", "iso-8859-10" },
  { "iso-ir-157", "iso-8859-10" },
  { "latin6", "iso-8859-10" }, /* this is not a bug */
  { "l6", "iso-8859-10" },
  { "csISOLatin6", "iso-8859-10" },

  { "csKOI8r", "koi8-r" },

  { "MS_Kanji", "Shift_JIS" }, /* Note the underscore! */
  { "csShiftJis", "Shift_JIS" },

  { "Extended_UNIX_Code_Packed_Format_for_Japanese", "euc-jp" },
  { "csEUCPkdFmtJapanese", "euc-jp" },

  { "csGB2312", "gb2312" },
  { "csbig5", "big5" },

  /* End of the official aliases.
   * What follows has been taken from glibc's localedata files. */

  { "iso_8859-13", "iso-8859-13" },
  { "iso-ir-179", "iso-8859-13" },
  { "latin7", "iso-8859-13" }, /* this is not a bug */
  { "l7", "iso-8859-13" },

  { "iso_8859-14", "iso-8859-14" },
  { "latin8", "iso-8859-14" }, /* this is not a bug */
  { "l8", "iso-8859-14" },

  { "iso_8859-15", "iso-8859-15" },
  { "latin9", "iso-8859-15" }, /* this is not a bug */

  /* Suggested by Ionel Mugurel Ciobica <tgakic@sg10.chem.tue.nl> */
  { "latin0", "iso-8859-15" }, /* this is not a bug */

  { "iso_8859-16", "iso-8859-16" },
  { "latin10", "iso-8859-16" }, /* this is not a bug */

  { "646", "us-ascii" },

  /* http://www.sun.com/software/white-papers/wp-unicode/ */
  { "eucJP", "euc-jp" },
  { "PCK", "Shift_JIS" },
  { "ko_KR-euc", "euc-kr" },
  { "zh_TW-big5", "big5" },

  /* seems to be common on some systems */
  { "sjis", "Shift_JIS" },
  { "euc-jp-ms", "eucJP-ms" },

  /* If you come across further system-specific charset naming oddities,
   * please add them above this comment, and submit a patch
   * to <neomutt-devel@neomutt.org> */

  { NULL, NULL },
  // clang-format on
};


/**
 * lookup_new - Create a new Lookup
 * @retval ptr Lookup allocated with MUTT_MEM_CALLOC()
 *
 * Release the result with lookup_free().
 */
static struct Lookup *lookup_new(void)
{
  struct Lookup *fresh = MUTT_MEM_CALLOC(1, struct Lookup);
  return fresh;
}


/**
 * lookup_free - Free a Lookup
 * @param ptr Lookup to free
 *
 * Releases the replacement string, the pattern string and the compiled regex,
 * then the Lookup itself.  Does nothing if `ptr` or `*ptr` is NULL.
 */
static void lookup_free(struct Lookup **ptr)
{
  if (!ptr || !*ptr)
    return;

  struct Lookup *l = *ptr;
  FREE(&l->replacement);
  FREE(&l->regex.pattern);
  if (l->regex.regex)
    regfree(l->regex.regex);
  FREE(&l->regex.regex);
  /* `l->regex` is embedded in the Lookup (not a separate allocation), so it
   * must not be passed to FREE(); its storage goes away with `l` below. */

  FREE(ptr);
}


static const char *lookup_charset(enum LookupType type, const char *cs)

{

  if (!cs)

    return NULL;


  struct Lookup *l = NULL;


  TAILQ_FOREACH(l, &Lookups, entries)

  {

    if (l->type != type)

      continue;

    if (mutt_regex_match(&l->regex, cs))

      return l->replacement;

  }

  return NULL;

}


int mutt_ch_convert_nonmime_string(const struct Slist *const assumed_charset,

                                   const char *charset, char **ps)

{

  if (!ps)

    return -1;


  char *u = *ps;

  const size_t ulen = mutt_str_len(u);

  if (ulen == 0)

    return 0;


  const struct ListNode *np = NULL;

  STAILQ_FOREACH(np, &assumed_charset->head, entries)

  {

    char const *c = np->data;

    size_t n = mutt_str_len(c);

    char *fromcode = MUTT_MEM_MALLOC(n + 1, char);

    mutt_str_copy(fromcode, c, n + 1);

    char *s = mutt_strn_dup(u, ulen);

    int m = mutt_ch_convert_string(&s, fromcode, charset, MUTT_ICONV_NO_FLAGS);

    FREE(&fromcode);

    if (m == 0)

    {

      FREE(ps);

      *ps = s;

      return 0;

    }

    FREE(&s);

  }

  mutt_ch_convert_string(ps, mutt_ch_get_default_charset(assumed_charset),

                         charset, MUTT_ICONV_HOOK_FROM);

  return -1;

}


/**
 * mutt_ch_canonical_charset - Canonicalise the charset of a string
 * @param buf    Buffer for the canonical name
 * @param buflen Length of the buffer
 * @param name   Charset name to canonicalise, optionally followed by "/extension"
 *
 * The name is mapped to "utf-8", or to its entry in PreferredMimeNames, or
 * else lower-cased.  Common "8859" / "iso8859" misspellings are normalised
 * before the table lookup.  Any non-empty "/extension" is re-appended.
 * The result is truncated to fit `buflen`.
 *
 * Nothing is written if `buf` or `name` is NULL.
 */
void mutt_ch_canonical_charset(char *buf, size_t buflen, const char *name)
{
  if (!buf || !name)
    return;

  char in[1024] = { 0 };
  char scratch[1024 + 10] = { 0 };
  struct Buffer *canon = buf_pool_get();

  mutt_str_copy(in, name, sizeof(in));

  /* Split off any "/extension", so only the charset name is canonicalised */
  char *ext = strchr(in, '/');
  if (ext)
  {
    *ext = '\0';
    ext++;
  }

  if (mutt_istr_equal(in, "utf-8") || mutt_istr_equal(in, "utf8"))
  {
    buf_strcpy(canon, "utf-8");
  }
  else
  {
    /* catch some common iso-8859-something misspellings;
     * a '-' directly after the prefix is optional and is skipped */
    size_t skip = mutt_istr_startswith(in, "8859");
    if (skip != 0)
    {
      if (in[skip] == '-')
        skip++;
      snprintf(scratch, sizeof(scratch), "iso-8859-%s", in + skip);
    }
    else if ((skip = mutt_istr_startswith(in, "iso8859")) != 0)
    {
      if (in[skip] == '-')
        skip++;
      snprintf(scratch, sizeof(scratch), "iso_8859-%s", in + skip);
    }
    else
    {
      mutt_str_copy(scratch, in, sizeof(scratch));
    }

    /* Find the first table entry whose key matches, if any */
    size_t idx = 0;
    while (PreferredMimeNames[idx].key &&
           !mutt_istr_equal(scratch, PreferredMimeNames[idx].key))
    {
      idx++;
    }

    if (PreferredMimeNames[idx].key)
    {
      buf_strcpy(canon, PreferredMimeNames[idx].pref);
    }
    else
    {
      buf_strcpy(canon, scratch);
      buf_lower(canon); // for cosmetics' sake
    }
  }

  if (ext && (*ext != '\0'))
  {
    buf_addch(canon, '/');
    buf_addstr(canon, ext);
  }

  mutt_str_copy(buf, buf_string(canon), buflen);
  buf_pool_release(&canon);
}


/**
 * mutt_ch_chscmp - Are the names of two character sets equivalent?
 * @param cs1 First character set (canonicalised before comparing)
 * @param cs2 Second character set (compared as given)
 * @retval true  The shorter name is a case-insensitive prefix of the longer one
 * @retval false Otherwise, or either name is NULL
 */
bool mutt_ch_chscmp(const char *cs1, const char *cs2)
{
  if (!cs1 || !cs2)
    return false;

  char canon1[256] = { 0 };
  mutt_ch_canonical_charset(canon1, sizeof(canon1), cs1);

  const int len1 = mutt_str_len(canon1);
  const int len2 = mutt_str_len(cs2);

  /* Compare only as many characters as the shorter name has */
  const char *longer = canon1;
  const char *shorter = cs2;
  int num = len2;
  if (len1 <= len2)
  {
    longer = cs2;
    shorter = canon1;
    num = len1;
  }

  return mutt_istrn_equal(longer, shorter, num);
}


/**
 * mutt_ch_get_default_charset - Get the default character set
 * @param assumed_charset List of assumed character sets (may be NULL)
 * @retval ptr First entry of the list, or "us-ascii" if the list is NULL or empty
 *
 * @note The result lives in a static buffer, which the next call overwrites.
 */
const char *mutt_ch_get_default_charset(const struct Slist *const assumed_charset)
{
  static char fcharset[128];

  const bool have_entry = assumed_charset && (assumed_charset->count > 0);
  const char *chosen = have_entry ? STAILQ_FIRST(&assumed_charset->head)->data : "us-ascii";

  mutt_str_copy(fcharset, chosen, sizeof(fcharset));
  return fcharset;
}


/**
 * mutt_ch_get_langinfo_charset - Get the user's choice of character set
 * @retval ptr Canonical form of nl_langinfo(CODESET), or "iso-8859-1" if that is empty
 *
 * The result is a copy made with mutt_str_dup().
 */
char *mutt_ch_get_langinfo_charset(void)
{
  char codeset[1024] = { 0 };

  mutt_ch_canonical_charset(codeset, sizeof(codeset), nl_langinfo(CODESET));

  return mutt_str_dup((codeset[0] == '\0') ? "iso-8859-1" : codeset);
}


/**
 * mutt_ch_lookup_add - Add a new character set lookup
 * @param type    Type of character set, e.g. #MUTT_LOOKUP_CHARSET
 * @param pat     Pattern to match (compiled case-insensitively)
 * @param replace Replacement string
 * @param err     Buffer for the regex error message (may be NULL)
 * @retval true  Lookup was appended to the list
 * @retval false `pat` or `replace` is NULL, or the pattern failed to compile
 */
bool mutt_ch_lookup_add(enum LookupType type, const char *pat,
                        const char *replace, struct Buffer *err)
{
  if (!pat || !replace)
    return false;

  regex_t *rx = MUTT_MEM_CALLOC(1, regex_t);
  int rc = REG_COMP(rx, pat, REG_ICASE);
  if (rc != 0)
  {
    /* Only report the reason if the caller gave us somewhere to put it */
    if (err)
      regerror(rc, rx, err->data, err->dsize);
    FREE(&rx);
    return false;
  }

  struct Lookup *l = lookup_new();
  l->type = type;
  l->replacement = mutt_str_dup(replace);
  l->regex.pattern = mutt_str_dup(pat);
  l->regex.regex = rx; /* the Lookup now owns the compiled regex */
  l->regex.pat_not = false;

  TAILQ_INSERT_TAIL(&Lookups, l, entries);

  return true;
}


void mutt_ch_lookup_remove(void)

{

  struct Lookup *l = NULL;

  struct Lookup *tmp = NULL;


  TAILQ_FOREACH_SAFE(l, &Lookups, entries, tmp)

  {

    TAILQ_REMOVE(&Lookups, l, entries);

    lookup_free(&l);

  }

}


/**
 * mutt_ch_charset_lookup - Look for a replacement character set
 * @param chs Character set to look up
 * @retval ptr  Replacement from the first matching #MUTT_LOOKUP_CHARSET entry
 * @retval NULL No matching entry
 */
const char *mutt_ch_charset_lookup(const char *chs)
{
  const char *replacement = lookup_charset(MUTT_LOOKUP_CHARSET, chs);
  return replacement;
}


iconv_t mutt_ch_iconv_open(const char *tocode, const char *fromcode, uint8_t flags)

{

  char tocode1[128] = { 0 };

  char fromcode1[128] = { 0 };

  const char *tocode2 = NULL, *fromcode2 = NULL;

  const char *tmp = NULL;


  /* transform to MIME preferred charset names */

  mutt_ch_canonical_charset(tocode1, sizeof(tocode1), tocode);

  mutt_ch_canonical_charset(fromcode1, sizeof(fromcode1), fromcode);


  /* maybe apply charset-hooks and recanonicalise fromcode,

   * but only when caller asked us to sanitize a potentially wrong

   * charset name incoming from the wild exterior. */

  if (flags & MUTT_ICONV_HOOK_FROM)

  {

    tmp = mutt_ch_charset_lookup(fromcode1);

    if (tmp)

      mutt_ch_canonical_charset(fromcode1, sizeof(fromcode1), tmp);

  }


  /* check if we have this pair cached already */

  for (int i = 0; i < IconvCacheUsed; i++)

  {

    if (strcmp(tocode1, IconvCache[i].tocode1) == 0 &&

        strcmp(fromcode1, IconvCache[i].fromcode1) == 0)

    {

      iconv_t cd = IconvCache[i].cd;


      /* make room for this one at the top */

      struct IconvCacheEntry top = IconvCache[i];

      for (int j = i - 1; j >= 0; j--)

      {

        IconvCache[j + 1] = IconvCache[j];

      }

      IconvCache[0] = top;


      if (iconv_t_valid(cd))

      {

        /* reset state */

        iconv(cd, NULL, NULL, NULL, NULL);

      }

      return cd;

    }

  }


  /* not found in cache */

  /* always apply iconv-hooks to suit system's iconv tastes */

  tocode2 = mutt_ch_iconv_lookup(tocode1);

  tocode2 = tocode2 ? tocode2 : tocode1;

  fromcode2 = mutt_ch_iconv_lookup(fromcode1);

  fromcode2 = fromcode2 ? fromcode2 : fromcode1;


  /* call system iconv with names it appreciates */

  iconv_t cd = iconv_open(tocode2, fromcode2);


  if (IconvCacheUsed == ICONV_CACHE_SIZE)

  {

    mutt_debug(LL_DEBUG2, "iconv: dropping %s -> %s from the cache\n",

               IconvCache[IconvCacheUsed - 1].fromcode1,

               IconvCache[IconvCacheUsed - 1].tocode1);

    /* get rid of the oldest entry */

    FREE(&IconvCache[IconvCacheUsed - 1].fromcode1);

    FREE(&IconvCache[IconvCacheUsed - 1].tocode1);

    if (iconv_t_valid(IconvCache[IconvCacheUsed - 1].cd))

    {

      iconv_close(IconvCache[IconvCacheUsed - 1].cd);

    }

    IconvCacheUsed--;

  }


  /* make room for this one at the top */

  for (int j = IconvCacheUsed - 1; j >= 0; j--)

  {

    IconvCache[j + 1] = IconvCache[j];

  }


  IconvCacheUsed++;


  mutt_debug(LL_DEBUG2, "iconv: adding %s -> %s to the cache\n", fromcode1, tocode1);

  IconvCache[0].fromcode1 = strdup(fromcode1);

  IconvCache[0].tocode1 = strdup(tocode1);

  IconvCache[0].cd = cd;


  return cd;

}


/**
 * mutt_ch_iconv - Change the encoding of a string
 * @param[in]     cd           iconv conversion descriptor
 * @param[in,out] inbuf        Input position; updated on return
 * @param[in,out] inbytesleft  Input bytes remaining; updated on return
 * @param[in,out] outbuf       Output position; updated on return
 * @param[in,out] outbytesleft Output bytes remaining; updated on return
 * @param[in]     inrepls      NULL-terminated array of replacement input strings (may be NULL)
 * @param[in]     outrepl      Replacement bytes to write to the output (NULL means "?")
 * @param[out]    iconverrno   If non-NULL, set to errno after each main iconv() call
 * @retval num Sum of iconv()'s non-error return values, plus one for each replaced input byte
 *
 * Like iconv(), but an illegal input sequence (EILSEQ) doesn't stop the
 * conversion while both input and output space remain.  Instead one input
 * byte is skipped and a replacement is emitted: first each string of
 * `inrepls` is tried through the converter, and if none converts completely,
 * `outrepl` is copied to the output as-is.
 */
size_t mutt_ch_iconv(iconv_t cd, const char **inbuf, size_t *inbytesleft,

                     char **outbuf, size_t *outbytesleft, const char **inrepls,

                     const char *outrepl, int *iconverrno)

{

  size_t rc = 0;

  /* Work on local copies; the caller's values are only updated on return */
  const char *ib = *inbuf;

  size_t ibl = *inbytesleft;

  char *ob = *outbuf;

  size_t obl = *outbytesleft;


  while (true)

  {

    /* Clear errno so that a stale value can't be mistaken for EILSEQ below */
    errno = 0;

    const size_t ret1 = iconv(cd, (ICONV_CONST char **) &ib, &ibl, &ob, &obl);

    if (ret1 != ICONV_ILLEGAL_SEQ)

      rc += ret1;

    if (iconverrno)

      *iconverrno = errno;


    /* Only recover from an illegal sequence if there's still input to skip
     * and room in the output for a replacement */
    if (ibl && obl && (errno == EILSEQ))

    {

      if (inrepls)

      {

        /* Try replacing the input */

        const char **t = NULL;

        for (t = inrepls; *t; t++)

        {

          /* Convert the candidate using copies of the output cursor, so a
           * failed attempt doesn't move the real one */
          const char *ib1 = *t;

          size_t ibl1 = strlen(*t);

          char *ob1 = ob;

          size_t obl1 = obl;

          iconv(cd, (ICONV_CONST char **) &ib1, &ibl1, &ob1, &obl1);

          if (ibl1 == 0)

          {

            /* The whole replacement converted: skip one input byte and
             * keep the output it produced */
            ib++;

            ibl--;

            ob = ob1;

            obl = obl1;

            rc++;

            break;

          }

        }

        /* `*t` is non-NULL only if the loop ended early, i.e. a replacement worked */
        if (*t)

          continue;

      }

      /* Replace the output */

      if (!outrepl)

        outrepl = "?";

      /* With NULL input, iconv() resets the conversion state and may write a
       * reset sequence to the output, so `obl` is re-checked afterwards */
      iconv(cd, NULL, NULL, &ob, &obl);

      if (obl)

      {

        /* NOTE(review): `n` is an int compared against the size_t `obl` below */
        int n = strlen(outrepl);

        if (n > obl)

        {

          /* Not enough room for the requested replacement: use a single '?' */
          outrepl = "?";

          n = 1;

        }

        memcpy(ob, outrepl, n);

        ib++;

        ibl--;

        ob += n;

        obl -= n;

        rc++;

        iconv(cd, NULL, NULL, NULL, NULL); /* for good measure */

        continue;

      }

    }

    /* Nothing more can be done: report the final positions to the caller */
    *inbuf = ib;

    *inbytesleft = ibl;

    *outbuf = ob;

    *outbytesleft = obl;

    return rc;

  }

}


/**
 * mutt_ch_iconv_lookup - Look for a replacement character set
 * @param chs Character set to look up
 * @retval ptr  Replacement from the first matching #MUTT_LOOKUP_ICONV entry
 * @retval NULL No matching entry
 */
const char *mutt_ch_iconv_lookup(const char *chs)
{
  const char *replacement = lookup_charset(MUTT_LOOKUP_ICONV, chs);
  return replacement;
}


/**
 * mutt_ch_check - Check whether a string can be converted between encodings
 * @param s    String to check
 * @param slen Length of the string to check
 * @param from Current character set
 * @param to   Target character set
 * @retval  0  The conversion succeeded
 * @retval -1  Bad arguments, `slen` too large, or the conversion couldn't be set up
 * @retval num errno set by iconv() when the conversion failed
 *
 * The converted text is discarded; only the outcome is reported.
 */
int mutt_ch_check(const char *s, size_t slen, const char *from, const char *to)
{
  if (!s || !from || !to)
    return -1;

  /* Refuse lengths whose worst-case output size would overflow a size_t
   * (the same limit that mutt_ch_convert_string() applies) */
  if (slen >= (SIZE_MAX / MB_LEN_MAX))
    return -1;

  int rc = 0;
  iconv_t cd = mutt_ch_iconv_open(to, from, MUTT_ICONV_NO_FLAGS);
  if (!iconv_t_valid(cd))
    return -1;

  size_t outlen = MB_LEN_MAX * slen;
  char *out = MUTT_MEM_MALLOC(outlen + 1, char);
  char *saved_out = out; /* iconv() advances `out`, so remember what to free */

  const size_t convlen = iconv(cd, (ICONV_CONST char **) &s, &slen, &out, &outlen);
  if (convlen == ICONV_ILLEGAL_SEQ)
    rc = errno;

  FREE(&saved_out);
  return rc;
}


/**
 * mutt_ch_convert_string - Convert a string between encodings
 * @param[in,out] ps    String to convert; replaced by the converted string
 * @param[in]     from  Current character set
 * @param[in]     to    Target character set
 * @param[in]     flags Flags, e.g. #MUTT_ICONV_HOOK_FROM
 * @retval  0  Success, or nothing to do (`*ps` is NULL or empty)
 * @retval -1  Bad arguments, the conversion couldn't be set up, or the string is too long
 * @retval num errno reported by mutt_ch_iconv()
 *
 * Illegal sequences are substituted: when converting to utf-8 the bytes of
 * U+FFFD are written; when converting from utf-8, U+FFFD and then "?" are
 * tried as replacement input; otherwise "?" is written.
 */
int mutt_ch_convert_string(char **ps, const char *from, const char *to, uint8_t flags)
{
  if (!ps)
    return -1;

  char *src = *ps;
  if (!src || (src[0] == '\0'))
    return 0;

  if (!to || !from)
    return -1;

  const char *repls[] = { "\357\277\275", "?", NULL };

  iconv_t cd = mutt_ch_iconv_open(to, from, flags);
  if (!iconv_t_valid(cd))
    return -1;

  /* Choose how illegal sequences get substituted */
  const char **inrepls = NULL;
  const char *outrepl = NULL;
  if (mutt_ch_is_utf8(to))
  {
    outrepl = "\357\277\275";
  }
  else if (mutt_ch_is_utf8(from))
  {
    inrepls = repls;
  }
  else
  {
    outrepl = "?";
  }

  const char *ib = src;
  size_t ibl = strlen(src);
  if (ibl >= (SIZE_MAX / MB_LEN_MAX))
    return -1;

  /* Worst case: every input byte becomes MB_LEN_MAX output bytes */
  size_t obl = MB_LEN_MAX * ibl;
  char *converted = MUTT_MEM_MALLOC(obl + 1, char);
  char *ob = converted;

  int rc = 0;
  mutt_ch_iconv(cd, &ib, &ibl, &ob, &obl, inrepls, outrepl, &rc);
  iconv(cd, NULL, NULL, &ob, &obl);
  *ob = '\0';

  /* Swap the caller's string for the converted one */
  FREE(ps);
  *ps = converted;
  mutt_str_adjust(ps);

  return rc;
}


/**
 * mutt_ch_check_charset - Does iconv understand a character set?
 * @param cs     Character set to check
 * @param strict If false, also accept any key or preferred name in PreferredMimeNames
 * @retval true  `cs` is utf-8, is in the table (non-strict only), or iconv can open it
 * @retval false Otherwise, or `cs` is NULL
 */
bool mutt_ch_check_charset(const char *cs, bool strict)
{
  if (!cs)
    return false;

  if (mutt_ch_is_utf8(cs))
    return true;

  if (!strict)
  {
    int idx = 0;
    while (PreferredMimeNames[idx].key)
    {
      if (mutt_istr_equal(PreferredMimeNames[idx].key, cs) ||
          mutt_istr_equal(PreferredMimeNames[idx].pref, cs))
      {
        return true;
      }
      idx++;
    }
  }

  /* Final test: can a conversion from the charset to itself be opened? */
  iconv_t cd = mutt_ch_iconv_open(cs, cs, MUTT_ICONV_NO_FLAGS);
  return iconv_t_valid(cd);
}


/**
 * mutt_ch_fgetconv_open - Prepare a file for charset conversion
 * @param fp    FILE ptr to read from
 * @param from  Current character set
 * @param to    Target character set
 * @param flags Flags, e.g. #MUTT_ICONV_HOOK_FROM
 * @retval ptr fgetconv handle; release it with mutt_ch_fgetconv_close()
 *
 * If either charset is NULL, or the conversion can't be opened, the handle's
 * descriptor is invalid and mutt_ch_fgetconv() reads the file unconverted.
 */
struct FgetConv *mutt_ch_fgetconv_open(FILE *fp, const char *from, const char *to, uint8_t flags)
{
  static const char *repls[] = { "\357\277\275", "?", 0 };

  const iconv_t cd = (from && to) ? mutt_ch_iconv_open(to, from, flags) : ICONV_T_INVALID;

  struct FgetConv *fc = MUTT_MEM_CALLOC(1, struct FgetConv);
  fc->fp = fp;
  fc->cd = cd;

  if (!iconv_t_valid(cd))
    return fc;

  /* Both buffers start out empty */
  fc->p = fc->bufo;
  fc->ob = fc->bufo;
  fc->ib = fc->bufi;
  fc->ibl = 0;
  /* For a utf-8 target try U+FFFD first, otherwise only "?" */
  fc->inrepls = mutt_ch_is_utf8(to) ? repls : repls + 1;

  return fc;
}


/**
 * mutt_ch_fgetconv_close - Close an fgetconv handle
 * @param ptr fgetconv handle
 *
 * Only the handle is freed; neither its FILE nor its iconv descriptor is
 * closed here.
 */
void mutt_ch_fgetconv_close(struct FgetConv **ptr)
{
  if (ptr && *ptr)
    FREE(ptr);
}


/**
 * mutt_ch_fgetconv - Convert a file's character set
 * @param fc FgetConv handle
 * @retval num Next converted byte, as an unsigned char
 * @retval EOF `fc` is NULL, the file is exhausted, or conversion can't make progress
 *
 * A replacement for fgetc() that converts the file's contents on the fly.
 * If the handle has no valid iconv descriptor, bytes are read unconverted.
 * Once EOF has been returned on the converting path, `fc->p` is NULL and
 * every further call returns EOF.
 */
int mutt_ch_fgetconv(struct FgetConv *fc)
{
  if (!fc)
    return EOF;
  if (!iconv_t_valid(fc->cd))
    return fgetc(fc->fp);
  if (!fc->p)
    return EOF;
  /* Hand out bytes that were converted earlier */
  if (fc->p < fc->ob)
    return (unsigned char) *(fc->p)++;

  /* Try to convert some more */
  fc->p = fc->bufo;
  fc->ob = fc->bufo;
  if (fc->ibl)
  {
    size_t obl = sizeof(fc->bufo);
    iconv(fc->cd, (ICONV_CONST char **) &fc->ib, &fc->ibl, &fc->ob, &obl);
    if (fc->p < fc->ob)
      return (unsigned char) *(fc->p)++;
  }

  /* If we trusted iconv a bit more, we would at this point
   * ask why it had stopped converting ... */

  /* Try to read some more */
  if ((fc->ibl == sizeof(fc->bufi)) ||
      (fc->ibl && (fc->ib + fc->ibl < fc->bufi + sizeof(fc->bufi))))
  {
    /* Either the input buffer is full yet nothing converts, or the leftover
     * bytes don't reach the end of the buffer: give up */
    fc->p = 0;
    return EOF;
  }
  /* Slide the unconverted tail to the front of the buffer.  The tail may
   * overlap its destination (when more than half the buffer is left over),
   * so memmove() is required; memcpy() would be undefined behaviour. */
  if (fc->ibl)
    memmove(fc->bufi, fc->ib, fc->ibl);
  fc->ib = fc->bufi;
  fc->ibl += fread(fc->ib + fc->ibl, 1, sizeof(fc->bufi) - fc->ibl, fc->fp);

  /* Try harder this time to convert some */
  if (fc->ibl)
  {
    size_t obl = sizeof(fc->bufo);
    mutt_ch_iconv(fc->cd, (const char **) &fc->ib, &fc->ibl, &fc->ob, &obl,
                  fc->inrepls, 0, NULL);
    if (fc->p < fc->ob)
      return (unsigned char) *(fc->p)++;
  }

  /* Either the file has finished or one of the buffers is too small */
  fc->p = 0;
  return EOF;
}


/**
 * mutt_ch_fgetconvs - Convert a file's charset into a string buffer
 * @param buf    Buffer for the result
 * @param buflen Length of the buffer
 * @param fc     FgetConv handle
 * @retval ptr  `buf`, holding at least one character
 * @retval NULL Nothing was read, or `buf` is NULL, or `buflen` is 0
 *
 * Reads converted characters until a newline has been stored, the input ends,
 * or the buffer is full.  The result is always NUL-terminated.
 */
char *mutt_ch_fgetconvs(char *buf, size_t buflen, struct FgetConv *fc)
{
  /* A zero-length buffer can't even hold the terminating NUL */
  if (!buf || (buflen == 0))
    return NULL;

  size_t r;
  for (r = 0; (r + 1) < buflen;)
  {
    const int c = mutt_ch_fgetconv(fc);
    if (c == EOF)
      break;
    buf[r++] = (char) c;
    if (c == '\n')
      break;
  }
  buf[r] = '\0';

  if (r > 0)
    return buf;

  return NULL;
}


/**
 * mutt_ch_set_charset - Update the records for a new character set
 * @param charset New character set
 *
 * Canonicalises the name, then sets CharsetIsUtf8 and ReplacementChar to
 * match.  When NLS support is compiled in, the canonical name is also passed
 * to bind_textdomain_codeset().
 */
void mutt_ch_set_charset(const char *charset)
{
  char canon[256] = { 0 };

  mutt_ch_canonical_charset(canon, sizeof(canon), charset);

  const bool is_utf8 = mutt_ch_is_utf8(canon);
  CharsetIsUtf8 = is_utf8;
  /* 0xfffd is the Unicode replacement character */
  ReplacementChar = is_utf8 ? 0xfffd : '?';

#if defined(HAVE_BIND_TEXTDOMAIN_CODESET) && defined(ENABLE_NLS)
  bind_textdomain_codeset(PACKAGE, canon);
#endif
}


/**
 * mutt_ch_choose - Figure the best charset to encode a string
 * @param[in]  fromcode Original charset of the string
 * @param[in]  charsets List of potential charsets to use
 * @param[in]  u        String to encode
 * @param[in]  ulen     Length of the string to encode
 * @param[out] d        If non-NULL, receives the converted string
 * @param[out] dlen     If non-NULL, receives the length of the chosen string
 * @retval ptr  Canonical name of the chosen charset (allocated)
 * @retval NULL No charset worked, or `fromcode` / `charsets` is NULL
 *
 * Every charset in the list is tried.  Of those that work, the first one is
 * kept unless a later one has a strictly shorter name.  When `d` is NULL the
 * string is only checked, not converted.  `*d` and `*dlen` are written only
 * if a charset was chosen.
 */
char *mutt_ch_choose(const char *fromcode, const struct Slist *charsets,
                     const char *u, size_t ulen, char **d, size_t *dlen)
{
  if (!fromcode || !charsets)
    return NULL;

  char *best_text = NULL;
  char *tocode = NULL;
  size_t best_text_len = 0;
  size_t best_name_len = 0;

  const struct ListNode *np = NULL;
  STAILQ_FOREACH(np, &charsets->head, entries)
  {
    char *candidate = mutt_str_dup(np->data);
    if (!candidate)
      continue;

    const size_t name_len = mutt_str_len(candidate);
    char *text = mutt_strn_dup(u, ulen);

    int rc = -1;
    if (d)
      rc = mutt_ch_convert_string(&text, fromcode, candidate, MUTT_ICONV_NO_FLAGS);
    else
      rc = mutt_ch_check(text, ulen, fromcode, candidate);

    if ((rc == 0) && (!tocode || (name_len < best_name_len)))
    {
      best_name_len = name_len;
      best_text_len = mutt_str_len(text);

      /* The candidate becomes the new favourite */
      FREE(&tocode);
      tocode = candidate;
      candidate = NULL;

      if (d)
      {
        /* Keep the converted text for the caller */
        FREE(&best_text);
        best_text = text;
        text = NULL;
      }
    }

    /* Release whatever wasn't handed over above */
    FREE(&candidate);
    FREE(&text);
  }

  if (!tocode)
    return NULL;

  if (d)
    *d = best_text;
  if (dlen)
    *dlen = best_text_len;

  char canonical_buf[1024] = { 0 };
  mutt_ch_canonical_charset(canonical_buf, sizeof(canonical_buf), tocode);
  mutt_str_replace(&tocode, canonical_buf);

  return tocode;
}


void mutt_ch_cache_cleanup(void)

{

  for (int i = 0; i < IconvCacheUsed; i++)

  {

    FREE(&IconvCache[i].fromcode1);

    FREE(&IconvCache[i].tocode1);

    if (iconv_t_valid(IconvCache[i].cd))

    {

      iconv_close(IconvCache[i].cd);

    }

  }

  IconvCacheUsed = 0;

}

buf_addch
size_t buf_addch(struct Buffer *buf, char c)
Add a single character to a Buffer.
Definition: buffer.c:241

buf_addstr
size_t buf_addstr(struct Buffer *buf, const char *s)
Add a string to a Buffer.
Definition: buffer.c:226

buf_strcpy
size_t buf_strcpy(struct Buffer *buf, const char *s)
Copy a string into a Buffer.
Definition: buffer.c:395

buf_lower
void buf_lower(struct Buffer *buf)
Sets a buffer to lowercase.
Definition: buffer.c:734

buffer.h
General purpose object for storing and parsing strings.

buf_string
static const char * buf_string(const struct Buffer *buf)
Convert a buffer to a const char * "string".
Definition: buffer.h:96

mutt_debug
#define mutt_debug(LEVEL,...)
Definition: logging2.h:90

list.h
Singly-linked list type.

logging2.h
Logging Dispatcher.

LL_DEBUG2
@ LL_DEBUG2
Log at debug level 2.
Definition: logging2.h:45

memory.h
Memory management wrappers.

FREE
#define FREE(x)
Definition: memory.h:55

MIN
#define MIN(a, b)
Definition: memory.h:32

MUTT_MEM_CALLOC
#define MUTT_MEM_CALLOC(n, type)
Definition: memory.h:40

MUTT_MEM_MALLOC
#define MUTT_MEM_MALLOC(n, type)
Definition: memory.h:41

mutt_ch_check_charset
bool mutt_ch_check_charset(const char *cs, bool strict)
Does iconv understand a character set?
Definition: charset.c:894

mutt_ch_iconv
size_t mutt_ch_iconv(iconv_t cd, const char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft, const char **inrepls, const char *outrepl, int *iconverrno)
Change the encoding of a string.
Definition: charset.c:697

mutt_ch_lookup_remove
void mutt_ch_lookup_remove(void)
Remove all the character set lookups.
Definition: charset.c:541

IconvCacheUsed
static int IconvCacheUsed
Number of iconv descriptors in the cache.
Definition: charset.c:100

mutt_ch_choose
char * mutt_ch_choose(const char *fromcode, const struct Slist *charsets, const char *u, size_t ulen, char **d, size_t *dlen)
Figure the best charset to encode a string.
Definition: charset.c:1108

mutt_ch_convert_nonmime_string
int mutt_ch_convert_nonmime_string(const struct Slist *const assumed_charset, const char *charset, char **ps)
Try to convert a string using a list of character sets.
Definition: charset.c:331

Lookups
static struct LookupList Lookups
Lookup table of preferred character set names.
Definition: charset.c:83

mutt_ch_get_langinfo_charset
char * mutt_ch_get_langinfo_charset(void)
Get the user's choice of character set.
Definition: charset.c:486

PreferredMimeNames
static const struct MimeNames PreferredMimeNames[]
Lookup table of preferred charsets.
Definition: charset.c:121

mutt_ch_lookup_add
bool mutt_ch_lookup_add(enum LookupType type, const char *pat, const char *replace, struct Buffer *err)
Add a new character set lookup.
Definition: charset.c:509

mutt_ch_canonical_charset
void mutt_ch_canonical_charset(char *buf, size_t buflen, const char *name)
Canonicalise the charset of a string.
Definition: charset.c:374

mutt_ch_cache_cleanup
void mutt_ch_cache_cleanup(void)
Clean up the cached iconv handles and charset strings.
Definition: charset.c:1175

mutt_ch_iconv_lookup
const char * mutt_ch_iconv_lookup(const char *chs)
Look for a replacement character set.
Definition: charset.c:781

mutt_ch_convert_string
int mutt_ch_convert_string(char **ps, const char *from, const char *to, uint8_t flags)
Convert a string between encodings.
Definition: charset.c:831

mutt_ch_set_charset
void mutt_ch_set_charset(const char *charset)
Update the records for a new character set.
Definition: charset.c:1075

CharsetIsUtf8
bool CharsetIsUtf8
Is the user's current character set utf-8?
Definition: charset.c:66

lookup_charset
static const char * lookup_charset(enum LookupType type, const char *cs)
Look for a preferred character set name.
Definition: charset.c:303

mutt_ch_check
int mutt_ch_check(const char *s, size_t slen, const char *from, const char *to)
Check whether a string can be converted between encodings.
Definition: charset.c:796

mutt_ch_charset_lookup
const char * mutt_ch_charset_lookup(const char *chs)
Look for a replacement character set.
Definition: charset.c:562

lookup_new
static struct Lookup * lookup_new(void)
Create a new Lookup.
Definition: charset.c:269

mutt_ch_fgetconv
int mutt_ch_fgetconv(struct FgetConv *fc)
Convert a file's character set.
Definition: charset.c:980

ICONV_CACHE_SIZE
#define ICONV_CACHE_SIZE
Max size of the iconv cache.
Definition: charset.c:96

lookup_free
static void lookup_free(struct Lookup **ptr)
Free a Lookup.
Definition: charset.c:278

ReplacementChar
wchar_t ReplacementChar
When a Unicode character can't be displayed, use this instead.
Definition: charset.c:61

EILSEQ
#define EILSEQ
Definition: charset.c:55

mutt_ch_fgetconv_open
struct FgetConv * mutt_ch_fgetconv_open(FILE *fp, const char *from, const char *to, uint8_t flags)
Prepare a file for charset conversion.
Definition: charset.c:933

IconvCache
static struct IconvCacheEntry IconvCache[ICONV_CACHE_SIZE]
Cache of iconv conversion descriptors.
Definition: charset.c:98

mutt_ch_fgetconvs
char * mutt_ch_fgetconvs(char *buf, size_t buflen, struct FgetConv *fc)
Convert a file's charset into a string buffer.
Definition: charset.c:1042

mutt_ch_chscmp
bool mutt_ch_chscmp(const char *cs1, const char *cs2)
Are the names of two character sets equivalent?
Definition: charset.c:442

mutt_ch_fgetconv_close
void mutt_ch_fgetconv_close(struct FgetConv **ptr)
Close an fgetconv handle.
Definition: charset.c:962

mutt_ch_iconv_open
iconv_t mutt_ch_iconv_open(const char *tocode, const char *fromcode, uint8_t flags)
Set up iconv for conversions.
Definition: charset.c:594

mutt_ch_get_default_charset
const char * mutt_ch_get_default_charset(const struct Slist *const assumed_charset)
Get the default character set.
Definition: charset.c:465

charset.h
Conversion between different character encodings.

MUTT_ICONV_HOOK_FROM
#define MUTT_ICONV_HOOK_FROM
apply charset-hooks to fromcode
Definition: charset.h:65

ICONV_T_INVALID
#define ICONV_T_INVALID
Error value for iconv functions.
Definition: charset.h:93

mutt_ch_is_utf8
#define mutt_ch_is_utf8(str)
Definition: charset.h:89

LookupType
LookupType
Types of character set lookups.
Definition: charset.h:59

MUTT_LOOKUP_ICONV
@ MUTT_LOOKUP_ICONV
Character set conversion.
Definition: charset.h:61

MUTT_LOOKUP_CHARSET
@ MUTT_LOOKUP_CHARSET
Alias for another character set.
Definition: charset.h:60

MUTT_ICONV_NO_FLAGS
#define MUTT_ICONV_NO_FLAGS
No flags are set.
Definition: charset.h:64

ICONV_ILLEGAL_SEQ
#define ICONV_ILLEGAL_SEQ
Error value for iconv() - Illegal sequence.
Definition: charset.h:96

iconv_t_valid
static bool iconv_t_valid(const iconv_t cd)
Is the conversion descriptor valid?
Definition: charset.h:105

mutt_regex_match
bool mutt_regex_match(const struct Regex *regex, const char *str)
Shorthand to mutt_regex_capture()
Definition: regex.c:614

mutt_strn_dup
char * mutt_strn_dup(const char *begin, size_t len)
Duplicate a sub-string.
Definition: string.c:381

mutt_istr_equal
bool mutt_istr_equal(const char *a, const char *b)
Compare two strings, ignoring case.
Definition: string.c:673

mutt_str_dup
char * mutt_str_dup(const char *str)
Copy a string, safely.
Definition: string.c:254

mutt_str_adjust
void mutt_str_adjust(char **ptr)
Shrink-to-fit a string.
Definition: string.c:300

mutt_str_len
size_t mutt_str_len(const char *a)
Calculate the length of a string, safely.
Definition: string.c:497

mutt_str_copy
size_t mutt_str_copy(char *dest, const char *src, size_t dsize)
Copy a string into a buffer (guaranteeing NUL-termination)
Definition: string.c:582

mutt_istr_startswith
size_t mutt_istr_startswith(const char *str, const char *prefix)
Check whether a string starts with a prefix, ignoring case.
Definition: string.c:243

mutt_istrn_equal
bool mutt_istrn_equal(const char *a, const char *b, size_t num)
Check for equality of two strings ignoring case (to a maximum), safely.
Definition: string.c:454

mutt_str_replace
char * mutt_str_replace(char **p, const char *s)
Replace one string with another.
Definition: string.c:281

buf_pool_get
struct Buffer * buf_pool_get(void)
Get a Buffer from the pool.
Definition: pool.c:82

buf_pool_release
void buf_pool_release(struct Buffer **ptr)
Return a Buffer to the pool.
Definition: pool.c:96

pool.h
A global pool of Buffers.

queue.h

TAILQ_FOREACH
#define TAILQ_FOREACH(var, head, field)
Definition: queue.h:782

TAILQ_FOREACH_SAFE
#define TAILQ_FOREACH_SAFE(var, head, field, tvar)
Definition: queue.h:792

STAILQ_FIRST
#define STAILQ_FIRST(head)
Definition: queue.h:388

TAILQ_HEAD
#define TAILQ_HEAD(name, type)
Definition: queue.h:680

TAILQ_INSERT_TAIL
#define TAILQ_INSERT_TAIL(head, elm, field)
Definition: queue.h:866

STAILQ_FOREACH
#define STAILQ_FOREACH(var, head, field)
Definition: queue.h:390

TAILQ_REMOVE
#define TAILQ_REMOVE(head, elm, field)
Definition: queue.h:901

TAILQ_HEAD_INITIALIZER
#define TAILQ_HEAD_INITIALIZER(head)
Definition: queue.h:694

TAILQ_ENTRY
#define TAILQ_ENTRY(type)
Definition: queue.h:697

regex3.h
Manage regular expressions.

REG_COMP
#define REG_COMP(preg, regex, cflags)
Compile a regular expression.
Definition: regex3.h:50

slist.h
A separated list of strings.

string2.h
String manipulation functions.

Buffer
String manipulation buffer.
Definition: buffer.h:36

Buffer::dsize
size_t dsize
Length of data.
Definition: buffer.h:39

Buffer::data
char * data
Pointer to data.
Definition: buffer.h:37

FgetConv
Cursor for converting a file's encoding.
Definition: charset.h:43

FgetConv::bufi
char bufi[512]
Definition: charset.h:46

FgetConv::cd
iconv_t cd
iconv conversion descriptor
Definition: charset.h:45

FgetConv::bufo
char bufo[512]
Definition: charset.h:47

FgetConv::ibl
size_t ibl
Definition: charset.h:51

FgetConv::fp
FILE * fp
Definition: charset.h:44

FgetConv::p
char * p
Definition: charset.h:48

FgetConv::inrepls
const char ** inrepls
Definition: charset.h:52

FgetConv::ib
char * ib
Definition: charset.h:50

FgetConv::ob
char * ob
Definition: charset.h:49

IconvCacheEntry
Cached iconv conversion descriptor.
Definition: charset.c:89

IconvCacheEntry::tocode1
char * tocode1
Destination character set.
Definition: charset.c:91

IconvCacheEntry::fromcode1
char * fromcode1
Source character set.
Definition: charset.c:90

IconvCacheEntry::cd
iconv_t cd
iconv conversion descriptor
Definition: charset.c:92

ListNode
A List node for strings.
Definition: list.h:37

ListNode::data
char * data
String.
Definition: list.h:38

Lookup
Regex to String lookup table.
Definition: charset.c:74

Lookup::replacement
char * replacement
Alternative charset to use.
Definition: charset.c:77

Lookup::type
enum LookupType type
Lookup type.
Definition: charset.c:75

Lookup::regex
struct Regex regex
Regular expression.
Definition: charset.c:76

MimeNames
MIME name lookup entry.
Definition: charset.c:106

MimeNames::key
const char * key
Definition: charset.c:107

MimeNames::pref
const char * pref
Definition: charset.c:108

Regex
Cached regular expression.
Definition: regex3.h:86

Regex::pattern
char * pattern
printable version
Definition: regex3.h:87

Regex::pat_not
bool pat_not
do not match
Definition: regex3.h:89

Regex::regex
regex_t * regex
compiled expression
Definition: regex3.h:88

Slist
String list.
Definition: slist.h:37

Slist::head
struct ListHead head
List containing values.
Definition: slist.h:38

Slist::count
size_t count
Number of values in list.
Definition: slist.h:39