Conversion between different character encodings. More...

#include <iconv.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

Include dependency graph for charset.h:

This graph shows which files directly or indirectly include this file:

Data Structures
struct	FgetConv
	Cursor for converting a file's encoding. More...

Macros
#define	MUTT_ICONV_NO_FLAGS 0
	No flags are set.

#define	MUTT_ICONV_HOOK_FROM 1
	apply charset-hooks to fromcode

#define	mutt_ch_is_utf8(str) mutt_ch_chscmp(str, "utf-8")

#define	mutt_ch_is_us_ascii(str) mutt_ch_chscmp(str, "us-ascii")

#define	ICONV_T_INVALID ((iconv_t) -1)
	Error value for iconv functions.

#define	ICONV_ILLEGAL_SEQ ((size_t) -1)
	Error value for iconv() - Illegal sequence.

#define	ICONV_BUF_TOO_SMALL ((size_t) -2)
	Error value for iconv() - Buffer too small.

Enumerations
enum	LookupType { MUTT_LOOKUP_CHARSET , MUTT_LOOKUP_ICONV }
	Types of character set lookups. More...

Functions
void	mutt_ch_canonical_charset (char *buf, size_t buflen, const char *name)
	Canonicalise the charset of a string.

const char *	mutt_ch_charset_lookup (const char *chs)
	Look for a replacement character set.

int	mutt_ch_check (const char *s, size_t slen, const char *from, const char *to)
	Check whether a string can be converted between encodings.

bool	mutt_ch_check_charset (const char *cs, bool strict)
	Does iconv understand a character set?

char *	mutt_ch_choose (const char *fromcode, const struct Slist *charsets, const char *u, size_t ulen, char **d, size_t *dlen)
	Figure the best charset to encode a string.

bool	mutt_ch_chscmp (const char *cs1, const char *cs2)
	Are the names of two character sets equivalent?

int	mutt_ch_convert_nonmime_string (const struct Slist *const assumed_charset, const char *charset, char **ps)
	Try to convert a string using a list of character sets.

int	mutt_ch_convert_string (char **ps, const char *from, const char *to, uint8_t flags)
	Convert a string between encodings.

int	mutt_ch_fgetconv (struct FgetConv *fc)
	Convert a file's character set.

void	mutt_ch_fgetconv_close (struct FgetConv **ptr)
	Close an fgetconv handle.

struct FgetConv *	mutt_ch_fgetconv_open (FILE *fp, const char *from, const char *to, uint8_t flags)
	Prepare a file for charset conversion.

char *	mutt_ch_fgetconvs (char *buf, size_t buflen, struct FgetConv *fc)
	Convert a file's charset into a string buffer.

const char *	mutt_ch_get_default_charset (const struct Slist *const assumed_charset)
	Get the default character set.

char *	mutt_ch_get_langinfo_charset (void)
	Get the user's choice of character set.

size_t	mutt_ch_iconv (iconv_t cd, const char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft, const char **inrepls, const char *outrepl, int *iconverrno)
	Change the encoding of a string.

const char *	mutt_ch_iconv_lookup (const char *chs)
	Look for a replacement character set.

iconv_t	mutt_ch_iconv_open (const char *tocode, const char *fromcode, uint8_t flags)
	Set up iconv for conversions.

bool	mutt_ch_lookup_add (enum LookupType type, const char *pat, const char *replace, struct Buffer *err)
	Add a new character set lookup.

void	mutt_ch_lookup_remove (void)
	Remove all the character set lookups.

void	mutt_ch_set_charset (const char *charset)
	Update the records for a new character set.

void	mutt_ch_cache_cleanup (void)
	Clean up the cached iconv handles and charset strings.

static bool	iconv_t_valid (const iconv_t cd)
	Is the conversion descriptor valid?

Variables
bool	CharsetIsUtf8
	Is the user's current character set utf-8?

wchar_t	ReplacementChar
	When a Unicode character can't be displayed, use this instead.

Detailed Description

Conversion between different character encodings.

Authors

Richard Russon
Pietro Cerutti

Copyright: This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 2 of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with this program. If not, see http://www.gnu.org/licenses/.

Definition in file charset.h.

Macro Definition Documentation

◆ MUTT_ICONV_NO_FLAGS

#define MUTT_ICONV_NO_FLAGS 0

No flags are set.

Definition at line 64 of file charset.h.

◆ MUTT_ICONV_HOOK_FROM

#define MUTT_ICONV_HOOK_FROM 1

apply charset-hooks to fromcode

Definition at line 65 of file charset.h.

◆ mutt_ch_is_utf8

#define mutt_ch_is_utf8 ( str ) mutt_ch_chscmp(str, "utf-8")

Definition at line 89 of file charset.h.

◆ mutt_ch_is_us_ascii

#define mutt_ch_is_us_ascii ( str ) mutt_ch_chscmp(str, "us-ascii")

Definition at line 90 of file charset.h.

◆ ICONV_T_INVALID

#define ICONV_T_INVALID ((iconv_t) -1)

Error value for iconv functions.

Definition at line 93 of file charset.h.

◆ ICONV_ILLEGAL_SEQ

#define ICONV_ILLEGAL_SEQ ((size_t) -1)

Error value for iconv() - Illegal sequence.

Definition at line 96 of file charset.h.

◆ ICONV_BUF_TOO_SMALL

#define ICONV_BUF_TOO_SMALL ((size_t) -2)

Error value for iconv() - Buffer too small.

Definition at line 98 of file charset.h.

Enumeration Type Documentation

◆ LookupType

enum LookupType

Types of character set lookups.

Enumerator
MUTT_LOOKUP_CHARSET	Alias for another character set.
MUTT_LOOKUP_ICONV	Character set conversion.

Definition at line 58 of file charset.h.

{
  MUTT_LOOKUP_CHARSET, ///< Alias for another character set
  MUTT_LOOKUP_ICONV,   ///< Character set conversion
};

Function Documentation

◆ mutt_ch_canonical_charset()

void mutt_ch_canonical_charset	(	char *	buf,
		size_t	buflen,
		const char *	name
	)

Canonicalise the charset of a string.

Parameters

buf	Buffer for canonical character set name
buflen	Length of buffer
name	Name to be canonicalised

This first ties off any charset extension such as "//TRANSLIT", canonicalizes the charset and re-adds the extension

Definition at line 374 of file charset.c.

{
  /* Nothing can be done without both an output buffer and an input name */
  if (!buf || !name)
    return;

  char in[1024] = { 0 };
  /* The extra 10 bytes leave room for the "iso-8859-" prefix added below */
  char scratch[1024 + 10] = { 0 };
  struct Buffer *canon = buf_pool_get();

  /* Work on a private copy so the extension can be split off in place */
  mutt_str_copy(in, name, sizeof(in));
  char *ext = strchr(in, '/');
  if (ext)
    *ext++ = '\0'; /* terminate the base name; ext now points just past the first '/' */

  /* Both spellings of UTF-8 map straight to "utf-8" */
  if (mutt_istr_equal(in, "utf-8") || mutt_istr_equal(in, "utf8"))
  {
    buf_strcpy(canon, "utf-8");
    goto out;
  }

  /* catch some common iso-8859-something misspellings */
  size_t plen;
  if ((plen = mutt_istr_startswith(in, "8859")) && (in[plen] != '-'))
    snprintf(scratch, sizeof(scratch), "iso-8859-%s", in + plen);
  else if ((plen = mutt_istr_startswith(in, "8859-")))
    snprintf(scratch, sizeof(scratch), "iso-8859-%s", in + plen);
  else if ((plen = mutt_istr_startswith(in, "iso8859")) && (in[plen] != '-'))
    snprintf(scratch, sizeof(scratch), "iso_8859-%s", in + plen);
  else if ((plen = mutt_istr_startswith(in, "iso8859-")))
    snprintf(scratch, sizeof(scratch), "iso_8859-%s", in + plen);
  else
    mutt_str_copy(scratch, in, sizeof(scratch));

  /* Prefer the MIME name when the (repaired) name is a known key */
  for (size_t i = 0; PreferredMimeNames[i].key; i++)
  {
    if (mutt_istr_equal(scratch, PreferredMimeNames[i].key))
    {
      buf_strcpy(canon, PreferredMimeNames[i].pref);
      goto out;
    }
  }

  /* Unknown name: keep it, lower-cased */
  buf_strcpy(canon, scratch);
  buf_lower(canon); // for cosmetics' sake

out:
  /* Re-attach whatever followed the first '/' (e.g. "/TRANSLIT" from "//TRANSLIT") */
  if (ext && (*ext != '\0'))
  {
    buf_addch(canon, '/');
    buf_addstr(canon, ext);
  }

  /* Copy the result to the caller's buffer, bounded by buflen */
  mutt_str_copy(buf, buf_string(canon), buflen);
  buf_pool_release(&canon);
}

Here is the call graph for this function:

Here is the caller graph for this function:

◆ mutt_ch_charset_lookup()

const char * mutt_ch_charset_lookup ( const char * chs )

Look for a replacement character set.

Parameters

chs	Character set to lookup

Return values

ptr	Replacement character set (if a 'charset-hook' matches)
NULL	No matching hook

Look through all the 'charset-hook's. If one matches return the replacement character set.

Definition at line 562 of file charset.c.

{
  /* Only lookups of type MUTT_LOOKUP_CHARSET ('charset-hook') are searched */
  const char *replacement = lookup_charset(MUTT_LOOKUP_CHARSET, chs);
  return replacement;
}

Here is the call graph for this function:

Here is the caller graph for this function:

◆ mutt_ch_check()

int mutt_ch_check	(	const char *	s,
		size_t	slen,
		const char *	from,
		const char *	to
	)

Check whether a string can be converted between encodings.

Parameters

[in]	s	String to check
[in]	slen	Length of the string to check
[in]	from	Current character set
[in]	to	Target character set

Return values

0	Success
-1	Error in iconv_open()
>0	Errno as set by iconv()

Definition at line 796 of file charset.c.

{
  /* The string and both charset names are required */
  if (!s || !from || !to)
    return -1;

  /* Refuse lengths whose worst-case output size (MB_LEN_MAX bytes per input
   * byte) would overflow size_t; mutt_ch_convert_string() uses the same guard */
  if (slen >= (SIZE_MAX / MB_LEN_MAX))
    return -1;

  iconv_t cd = mutt_ch_iconv_open(to, from, MUTT_ICONV_NO_FLAGS);
  if (!iconv_t_valid(cd))
    return -1;

  int rc = 0;
  size_t outlen = MB_LEN_MAX * slen;
  char *out = MUTT_MEM_MALLOC(outlen + 1, char);
  char *saved_out = out; /* iconv() advances `out`; keep the start for freeing */

  /* The converted text is discarded; only success or failure matters */
  const size_t convlen = iconv(cd, (ICONV_CONST char **) &s, &slen, &out, &outlen);
  if (convlen == ICONV_ILLEGAL_SEQ)
    rc = errno;

  /* cd is not closed here: it lives in the cache kept by mutt_ch_iconv_open() */
  FREE(&saved_out);
  return rc;
}

Here is the call graph for this function:

Here is the caller graph for this function:

◆ mutt_ch_check_charset()

bool mutt_ch_check_charset	(	const char *	cs,
		bool	strict
	)

Does iconv understand a character set?

Parameters

cs	Character set to check
strict	Check strictly by using iconv

Return values

true	Character set is valid

If strict is false, then finding a matching character set in PreferredMimeNames will be enough. If strict is true, or the charset is not in PreferredMimeNames, then iconv() will be run.

Definition at line 894 of file charset.c.

{
  if (!cs)
    return false;

  /* utf-8 is accepted without consulting iconv */
  if (mutt_ch_is_utf8(cs))
    return true;

  /* Lenient mode: a hit in PreferredMimeNames (key or preferred name) is enough */
  for (int idx = 0; !strict && PreferredMimeNames[idx].key; idx++)
  {
    const bool is_key = mutt_istr_equal(PreferredMimeNames[idx].key, cs);
    if (is_key || mutt_istr_equal(PreferredMimeNames[idx].pref, cs))
      return true;
  }

  /* Otherwise the charset is valid only if a conversion to itself can be opened */
  iconv_t cd = mutt_ch_iconv_open(cs, cs, MUTT_ICONV_NO_FLAGS);
  return iconv_t_valid(cd);
}

Here is the call graph for this function:

Here is the caller graph for this function:

◆ mutt_ch_choose()

char * mutt_ch_choose	(	const char *	fromcode,
		const struct Slist *	charsets,
		const char *	u,
		size_t	ulen,
		char **	d,
		size_t *	dlen
	)

Figure the best charset to encode a string.

Parameters

[in]	fromcode	Original charset of the string
[in]	charsets	List of potential charsets to use
[in]	u	String to encode
[in]	ulen	Length of the string to encode
[out]	d	If not NULL, point it to the converted string
[out]	dlen	If not NULL, point it to the length of the d string

Return values

ptr	Best performing charset
NULL	None could be found

Definition at line 1108 of file charset.c.

{
  if (!fromcode || !charsets)
    return NULL;

  /* e/elen: best converted string so far; tocode/bestn: its charset name and
   * the length of that name */
  char *e = NULL, *tocode = NULL;
  size_t elen = 0, bestn = 0;

  const struct ListNode *np = NULL;
  STAILQ_FOREACH(np, &charsets->head, entries)
  {
    /* t is a private copy of the candidate charset name */
    char *t = mutt_str_dup(np->data);
    if (!t)
      continue;

    size_t n = mutt_str_len(t);
    char *s = mutt_strn_dup(u, ulen);
    /* Only do a real conversion when the caller wants the result back in d;
     * otherwise just test whether the conversion would succeed */
    const int rc = d ? mutt_ch_convert_string(&s, fromcode, t, MUTT_ICONV_NO_FLAGS) :
                       mutt_ch_check(s, ulen, fromcode, t);
    if (rc)
    {
      /* This candidate cannot encode the string */
      FREE(&t);
      FREE(&s);
      continue;
    }
    size_t slen = mutt_str_len(s);

    /* Keep the first working candidate, or a later one whose name length (n)
     * is strictly smaller than the current best */
    if (!tocode || (n < bestn))
    {
      bestn = n;
      FREE(&tocode);
      tocode = t; /* ownership of t moves to tocode */
      if (d)
      {
        FREE(&e);
        e = s; /* ownership of s moves to e */
      }
      else
      {
        FREE(&s);
      }
      elen = slen;
    }
    else
    {
      FREE(&t);
      FREE(&s);
    }
  }
  if (tocode)
  {
    /* Hand the converted string and its length to the caller, if requested */
    if (d)
      *d = e;
    if (dlen)
      *dlen = elen;

    /* Return the canonical form of the winning charset name */
    char canonical_buf[1024] = { 0 };
    mutt_ch_canonical_charset(canonical_buf, sizeof(canonical_buf), tocode);
    mutt_str_replace(&tocode, canonical_buf);
  }
  return tocode;
}

Here is the call graph for this function:

Here is the caller graph for this function:

◆ mutt_ch_chscmp()

bool mutt_ch_chscmp	(	const char *	cs1,
		const char *	cs2
	)

Are the names of two character sets equivalent?

Parameters

cs1	First character set
cs2	Second character set

Return values

true	Names are equivalent
false	Names differ

Charsets may have extensions that mutt_ch_canonical_charset() leaves intact. We expect 'cs2' to originate from neomutt code, not user input (i.e. 'cs2' does not have any extension), so we simply check whether the shorter string is a prefix of the longer.

Definition at line 442 of file charset.c.

{
  if (!cs1 || !cs2)
    return false;

  /* Only cs1 is canonicalised; cs2 is compared as given */
  char canon[256] = { 0 };
  mutt_ch_canonical_charset(canon, sizeof(canon), cs1);

  const int canon_len = mutt_str_len(canon);
  const int cs2_len = mutt_str_len(cs2);

  /* Case-insensitive prefix test: compare over the length of the shorter name */
  if (canon_len > cs2_len)
    return mutt_istrn_equal(canon, cs2, cs2_len);

  return mutt_istrn_equal(cs2, canon, canon_len);
}

Here is the call graph for this function:

◆ mutt_ch_convert_nonmime_string()

int mutt_ch_convert_nonmime_string	(	const struct Slist *const	assumed_charset,
		const char *	charset,
		char **	ps
	)

Try to convert a string using a list of character sets.

Parameters

[in]	assumed_charset	From $assumed_charset
[in]	charset	From $charset
[in,out]	ps	String to be converted

Return values

0	Success
-1	Error

Work through $assumed_charset looking for a character set conversion that works. Failing that, try mutt_ch_get_default_charset().

Definition at line 331 of file charset.c.

{
  if (!ps)
    return -1;

  char *u = *ps;
  const size_t ulen = mutt_str_len(u);
  if (ulen == 0)
    return 0; /* nothing to convert */

  /* A NULL list is tolerated: skip straight to the default-charset fallback,
   * which itself accepts a NULL assumed_charset */
  if (assumed_charset)
  {
    const struct ListNode *np = NULL;
    STAILQ_FOREACH(np, &assumed_charset->head, entries)
    {
      /* Try each assumed charset on a fresh copy of the input */
      char const *c = np->data;
      size_t n = mutt_str_len(c);
      char *fromcode = MUTT_MEM_MALLOC(n + 1, char);
      mutt_str_copy(fromcode, c, n + 1);
      char *s = mutt_strn_dup(u, ulen);
      int m = mutt_ch_convert_string(&s, fromcode, charset, MUTT_ICONV_NO_FLAGS);
      FREE(&fromcode);
      if (m == 0)
      {
        /* First clean conversion wins; replace the caller's string */
        FREE(ps);
        *ps = s;
        return 0;
      }
      FREE(&s);
    }
  }

  /* No candidate converted cleanly: convert in place from the default charset,
   * but still report an error */
  mutt_ch_convert_string(ps, mutt_ch_get_default_charset(assumed_charset),
                         charset, MUTT_ICONV_HOOK_FROM);
  return -1;
}

Here is the call graph for this function:

Here is the caller graph for this function:

◆ mutt_ch_convert_string()

int mutt_ch_convert_string	(	char **	ps,
		const char *	from,
		const char *	to,
		uint8_t	flags
	)

Convert a string between encodings.

Parameters

[in,out]	ps	String to convert
[in]	from	Current character set
[in]	to	Target character set
[in]	flags	Flags, e.g. MUTT_ICONV_HOOK_FROM

Return values

0	Success
-1	Invalid arguments or failure to open an iconv channel
errno	Failure in iconv conversion

Parameter flags is given as-is to mutt_ch_iconv_open(). See there for its meaning and usage policy.

Definition at line 831 of file charset.c.

{
  if (!ps)
    return -1;

  char *s = *ps;

  /* A NULL or empty string needs no conversion */
  if (!s || (*s == '\0'))
    return 0;

  if (!to || !from)
    return -1;

  /* Input replacements to try, in order: bytes EF BF BD (UTF-8 for U+FFFD), then "?" */
  const char *repls[] = { "\357\277\275", "?", 0 };
  int rc = 0;

  iconv_t cd = mutt_ch_iconv_open(to, from, flags);
  if (!iconv_t_valid(cd))
    return -1;

  const char **inrepls = NULL;
  const char *outrepl = NULL;

  /* Choose how unconvertible input is replaced:
   * - target is utf-8: write U+FFFD directly to the output
   * - source is utf-8: substitute replacement input sequences
   * - otherwise: write "?" to the output */
  if (mutt_ch_is_utf8(to))
    outrepl = "\357\277\275";
  else if (mutt_ch_is_utf8(from))
    inrepls = repls;
  else
    outrepl = "?";

  const char *ib = s;
  size_t ibl = strlen(s);
  /* Guard the MB_LEN_MAX * ibl multiplication below against overflow */
  if (ibl >= (SIZE_MAX / MB_LEN_MAX))
  {
    return -1;
  }
  size_t obl = MB_LEN_MAX * ibl;
  char *buf = MUTT_MEM_MALLOC(obl + 1, char);
  char *ob = buf;

  /* rc receives the errno reported by the conversion (0 on success) */
  mutt_ch_iconv(cd, &ib, &ibl, &ob, &obl, inrepls, outrepl, &rc);
  /* NULL input: let iconv emit any pending output for the target charset */
  iconv(cd, 0, 0, &ob, &obl);

  *ob = '\0';

  /* Replace the caller's string with the converted buffer */
  FREE(ps);
  *ps = buf;

  mutt_str_adjust(ps);
  return rc;
}

Here is the call graph for this function:

Here is the caller graph for this function:

◆ mutt_ch_fgetconv()

int mutt_ch_fgetconv ( struct FgetConv * fc )

Convert a file's character set.

Parameters

fc	FgetConv handle

Return values

num	Next character in the converted file
EOF	Error

A file is read into a buffer and its character set is converted. Each call to this function will return one converted character. The buffer is refilled automatically when empty.

Definition at line 980 of file charset.c.

{
  if (!fc)
    return EOF;
  /* No valid converter: read the file unconverted */
  if (!iconv_t_valid(fc->cd))
    return fgetc(fc->fp);
  /* p is cleared below once the stream is finished or stuck */
  if (!fc->p)
    return EOF;
  /* Converted bytes are still pending in bufo: hand out the next one */
  if (fc->p < fc->ob)
    return (unsigned char) *(fc->p)++;

  /* Try to convert some more */
  fc->p = fc->bufo;
  fc->ob = fc->bufo;
  if (fc->ibl)
  {
    size_t obl = sizeof(fc->bufo);
    iconv(fc->cd, (ICONV_CONST char **) &fc->ib, &fc->ibl, &fc->ob, &obl);
    if (fc->p < fc->ob)
      return (unsigned char) *(fc->p)++;
  }

  /* If we trusted iconv a bit more, we would at this point
   * ask why it had stopped converting ... */

  /* Try to read some more */
  /* Give up if the input buffer is full, or if unconverted input remains
   * that does not reach the end of bufi */
  if ((fc->ibl == sizeof(fc->bufi)) ||
      (fc->ibl && (fc->ib + fc->ibl < fc->bufi + sizeof(fc->bufi))))
  {
    fc->p = 0;
    return EOF;
  }
  /* Move the unconverted tail to the front of bufi, then top up from the file */
  if (fc->ibl)
    memcpy(fc->bufi, fc->ib, fc->ibl);
  fc->ib = fc->bufi;
  fc->ibl += fread(fc->ib + fc->ibl, 1, sizeof(fc->bufi) - fc->ibl, fc->fp);

  /* Try harder this time to convert some */
  if (fc->ibl)
  {
    size_t obl = sizeof(fc->bufo);
    /* mutt_ch_iconv() replaces invalid input instead of stopping */
    mutt_ch_iconv(fc->cd, (const char **) &fc->ib, &fc->ibl, &fc->ob, &obl,
                  fc->inrepls, 0, NULL);
    if (fc->p < fc->ob)
      return (unsigned char) *(fc->p)++;
  }

  /* Either the file has finished or one of the buffers is too small */
  fc->p = 0;
  return EOF;
}

Here is the call graph for this function:

Here is the caller graph for this function:

◆ mutt_ch_fgetconv_close()

void mutt_ch_fgetconv_close ( struct FgetConv ** ptr )

Close an fgetconv handle.

Parameters

[out] ptr fgetconv handle

Definition at line 962 of file charset.c.

{
  /* Only the handle is released here; nothing else is closed by this function */
  if (ptr && *ptr)
  {
    FREE(ptr);
  }
}

Here is the caller graph for this function:

◆ mutt_ch_fgetconv_open()

struct FgetConv * mutt_ch_fgetconv_open	(	FILE *	fp,
		const char *	from,
		const char *	to,
		uint8_t	flags
	)

Prepare a file for charset conversion.

Parameters

fp	FILE ptr to prepare
from	Current character set
to	Destination character set
flags	Flags, e.g. MUTT_ICONV_HOOK_FROM

Return values

ptr	fgetconv handle

Parameter flags is given as-is to mutt_ch_iconv_open().

Definition at line 933 of file charset.c.

{
  /* Input replacement sequences: bytes EF BF BD (UTF-8 for U+FFFD), then "?" */
  static const char *repls[] = { "\357\277\275", "?", 0 };

  /* Without both charset names the handle simply passes bytes through */
  iconv_t cd = ICONV_T_INVALID;
  if (from && to)
    cd = mutt_ch_iconv_open(to, from, flags);

  struct FgetConv *fc = MUTT_MEM_CALLOC(1, struct FgetConv);
  fc->fp = fp;
  fc->cd = cd;

  if (!iconv_t_valid(cd))
    return fc;

  /* Start with empty input and output buffers */
  fc->p = fc->bufo;
  fc->ob = fc->bufo;
  fc->ib = fc->bufi;
  fc->ibl = 0;

  /* The U+FFFD replacement is only offered when the target is utf-8 */
  if (mutt_ch_is_utf8(to))
    fc->inrepls = repls;
  else
    fc->inrepls = repls + 1;

  return fc;
}

Here is the call graph for this function:

Here is the caller graph for this function:

◆ mutt_ch_fgetconvs()

char * mutt_ch_fgetconvs	(	char *	buf,
		size_t	buflen,
		struct FgetConv *	fc
	)

Convert a file's charset into a string buffer.

Parameters

buf	Buffer for result
buflen	Length of buffer
fc	FgetConv handle

Return values

ptr	Success, result buffer
NULL	Error

Read a file into a buffer, converting the character set as it goes.

Definition at line 1042 of file charset.c.

{
  /* A zero-length buffer cannot even hold the terminating NUL, so writing
   * buf[0] below would be out of bounds */
  if (!buf || (buflen == 0))
    return NULL;

  size_t r;
  /* Always leave one byte for the terminator */
  for (r = 0; (r + 1) < buflen;)
  {
    const int c = mutt_ch_fgetconv(fc);
    if (c == EOF)
      break;
    buf[r++] = (char) c;
    /* Stop after a newline; the newline itself is kept in the buffer */
    if (c == '\n')
      break;
  }
  buf[r] = '\0';

  /* Nothing read (EOF, error, or buflen of 1) is reported as NULL */
  if (r > 0)
    return buf;

  return NULL;
}

Here is the call graph for this function:

Here is the caller graph for this function:

◆ mutt_ch_get_default_charset()

const char * mutt_ch_get_default_charset ( const struct Slist *const assumed_charset )

Get the default character set.

Parameters

assumed_charset From $assumed_charset

Return values

ptr	Name of the default character set

Warning: This returns a pointer to a static buffer. Do not free it.

Definition at line 465 of file charset.c.

{
  /* Static storage: the returned pointer stays valid after return and is
   * overwritten by the next call */
  static char fcharset[128];

  /* Use the first assumed charset when there is one, "us-ascii" otherwise */
  const bool have_assumed = assumed_charset && (assumed_charset->count > 0);
  const char *name = have_assumed ? STAILQ_FIRST(&assumed_charset->head)->data : "us-ascii";

  mutt_str_copy(fcharset, name, sizeof(fcharset));
  return fcharset;
}

Here is the call graph for this function:

Here is the caller graph for this function:

◆ mutt_ch_get_langinfo_charset()

char * mutt_ch_get_langinfo_charset ( void )

Get the user's choice of character set.

Return values

ptr	Charset string

Get the canonical character set used by the user's locale. The caller must free the returned string.

Definition at line 486 of file charset.c.

{
  char canon[1024] = { 0 };
  mutt_ch_canonical_charset(canon, sizeof(canon), nl_langinfo(CODESET));

  /* Fall back to "iso-8859-1" when canonicalisation left the buffer empty */
  const char *result = (canon[0] == '\0') ? "iso-8859-1" : canon;
  return mutt_str_dup(result);
}

Here is the call graph for this function:

Here is the caller graph for this function:

◆ mutt_ch_iconv()

size_t mutt_ch_iconv	(	iconv_t	cd,
		const char **	inbuf,
		size_t *	inbytesleft,
		char **	outbuf,
		size_t *	outbytesleft,
		const char **	inrepls,
		const char *	outrepl,
		int *	iconverrno
	)

Change the encoding of a string.

Parameters

[in]	cd	Iconv conversion descriptor
[in,out]	inbuf	Buffer to convert
[in,out]	inbytesleft	Length of buffer to convert
[in,out]	outbuf	Buffer for the result
[in,out]	outbytesleft	Length of result buffer
[in]	inrepls	Input replacement characters
[in]	outrepl	Output replacement characters
[out]	iconverrno	Errno if iconv() fails, 0 if it succeeds

Return values

num	Characters converted

Like iconv, but keeps going even when the input is invalid. If you're supplying inrepls, the source charset should be stateless; if you're supplying an outrepl, the target charset should be.

Definition at line 697 of file charset.c.

{
  size_t rc = 0;
  /* Work on local copies; the caller's cursors are only updated on return */
  const char *ib = *inbuf;
  size_t ibl = *inbytesleft;
  char *ob = *outbuf;
  size_t obl = *outbytesleft;

  while (true)
  {
    errno = 0;
    const size_t ret1 = iconv(cd, (ICONV_CONST char **) &ib, &ibl, &ob, &obl);
    /* Only successful calls add to the count */
    if (ret1 != ICONV_ILLEGAL_SEQ)
      rc += ret1;
    /* Report the errno of the most recent iconv() call */
    if (iconverrno)
      *iconverrno = errno;

    /* Recover only from an invalid sequence, and only while input and output
     * space both remain */
    if (ibl && obl && (errno == EILSEQ))
    {
      if (inrepls)
      {
        /* Try replacing the input */
        const char **t = NULL;
        for (t = inrepls; *t; t++)
        {
          /* Convert the replacement on scratch cursors so a failed attempt
           * leaves ob/obl untouched */
          const char *ib1 = *t;
          size_t ibl1 = strlen(*t);
          char *ob1 = ob;
          size_t obl1 = obl;
          iconv(cd, (ICONV_CONST char **) &ib1, &ibl1, &ob1, &obl1);
          if (ibl1 == 0)
          {
            /* Replacement fully consumed: skip one input byte and keep the output */
            ib++;
            ibl--;
            ob = ob1;
            obl = obl1;
            rc++;
            break;
          }
        }
        /* *t is non-NULL only if the loop ended via break, i.e. a replacement worked */
        if (*t)
          continue;
      }
      /* Replace the output */
      if (!outrepl)
        outrepl = "?";
      iconv(cd, NULL, NULL, &ob, &obl);
      if (obl)
      {
        int n = strlen(outrepl);
        /* Fall back to a single "?" if the replacement does not fit */
        if (n > obl)
        {
          outrepl = "?";
          n = 1;
        }
        /* Write the replacement bytes as-is and skip one input byte */
        memcpy(ob, outrepl, n);
        ib++;
        ibl--;
        ob += n;
        obl -= n;
        rc++;
        iconv(cd, NULL, NULL, NULL, NULL); /* for good measure */
        continue;
      }
    }
    /* Finished, or cannot recover: publish the cursors back to the caller */
    *inbuf = ib;
    *inbytesleft = ibl;
    *outbuf = ob;
    *outbytesleft = obl;
    return rc;
  }
}

Here is the caller graph for this function:

◆ mutt_ch_iconv_lookup()

const char * mutt_ch_iconv_lookup ( const char * chs )

Look for a replacement character set.

Parameters

chs	Character set to lookup

Return values

ptr	Replacement character set (if a 'iconv-hook' matches)
NULL	No matching hook

Look through all the 'iconv-hook's. If one matches return the replacement character set.

Definition at line 781 of file charset.c.

{
  /* Only lookups of type MUTT_LOOKUP_ICONV ('iconv-hook') are searched */
  const char *replacement = lookup_charset(MUTT_LOOKUP_ICONV, chs);
  return replacement;
}

Here is the call graph for this function:

Here is the caller graph for this function:

◆ mutt_ch_iconv_open()

iconv_t mutt_ch_iconv_open	(	const char *	tocode,
		const char *	fromcode,
		uint8_t	flags
	)

Set up iconv for conversions.

Parameters

tocode	Target character set
fromcode	Current character set
flags	Flags, e.g. MUTT_ICONV_HOOK_FROM

Return values

ptr	iconv handle for the conversion

Like iconv_open, but canonicalises the charsets, applies charset-hooks, recanonicalises, and finally applies iconv-hooks. Parameter flags=0 skips charset-hooks, while MUTT_ICONV_HOOK_FROM applies them to fromcode. Callers should use flags=0 when fromcode can safely be considered true, either some constant, or some value provided by the user; MUTT_ICONV_HOOK_FROM should be used only when fromcode is unsure, taken from a possibly wrong incoming MIME label, or such. Misusing MUTT_ICONV_HOOK_FROM leads to unwanted interactions in some setups.

Since calling iconv_open() repeatedly can be expensive, we keep a cache of the most recently used iconv_t objects, kept in LRU order. This means that you should not call iconv_close() on the object yourself. All remaining objects in the cache will be closed when main() calls mutt_ch_cache_cleanup().

Note: By design charset-hooks should never be, and are never, applied to tocode. The aptly named MUTT_ICONV_HOOK_FROM acts on charset-hooks, not at all on iconv-hooks.

Definition at line 594 of file charset.c.

{
  /* *1: canonical names, also used as cache keys; *2: names after iconv-hooks */
  char tocode1[128] = { 0 };
  char fromcode1[128] = { 0 };
  const char *tocode2 = NULL, *fromcode2 = NULL;
  const char *tmp = NULL;

  /* transform to MIME preferred charset names */
  mutt_ch_canonical_charset(tocode1, sizeof(tocode1), tocode);
  mutt_ch_canonical_charset(fromcode1, sizeof(fromcode1), fromcode);

  /* maybe apply charset-hooks and recanonicalise fromcode,
   * but only when caller asked us to sanitize a potentially wrong
   * charset name incoming from the wild exterior. */
  if (flags & MUTT_ICONV_HOOK_FROM)
  {
    tmp = mutt_ch_charset_lookup(fromcode1);
    if (tmp)
      mutt_ch_canonical_charset(fromcode1, sizeof(fromcode1), tmp);
  }

  /* check if we have this pair cached already */
  for (int i = 0; i < IconvCacheUsed; i++)
  {
    if (strcmp(tocode1, IconvCache[i].tocode1) == 0 &&
        strcmp(fromcode1, IconvCache[i].fromcode1) == 0)
    {
      iconv_t cd = IconvCache[i].cd;

      /* make room for this one at the top */
      /* Entries 0..i-1 each move down one slot; the hit goes to slot 0 */
      struct IconvCacheEntry top = IconvCache[i];
      for (int j = i - 1; j >= 0; j--)
      {
        IconvCache[j + 1] = IconvCache[j];
      }
      IconvCache[0] = top;

      if (iconv_t_valid(cd))
      {
        /* reset state */
        iconv(cd, NULL, NULL, NULL, NULL);
      }
      /* Invalid descriptors are cached too, so a failed pair is not re-opened */
      return cd;
    }
  }

  /* not found in cache */
  /* always apply iconv-hooks to suit system's iconv tastes */
  tocode2 = mutt_ch_iconv_lookup(tocode1);
  tocode2 = tocode2 ? tocode2 : tocode1;
  fromcode2 = mutt_ch_iconv_lookup(fromcode1);
  fromcode2 = fromcode2 ? fromcode2 : fromcode1;

  /* call system iconv with names it appreciates */
  iconv_t cd = iconv_open(tocode2, fromcode2);

  /* Cache full: evict the entry in the last slot */
  if (IconvCacheUsed == ICONV_CACHE_SIZE)
  {
    mutt_debug(LL_DEBUG2, "iconv: dropping %s -> %s from the cache\n",
               IconvCache[IconvCacheUsed - 1].fromcode1,
               IconvCache[IconvCacheUsed - 1].tocode1);
    /* get rid of the oldest entry */
    FREE(&IconvCache[IconvCacheUsed - 1].fromcode1);
    FREE(&IconvCache[IconvCacheUsed - 1].tocode1);
    if (iconv_t_valid(IconvCache[IconvCacheUsed - 1].cd))
    {
      iconv_close(IconvCache[IconvCacheUsed - 1].cd);
    }
    IconvCacheUsed--;
  }

  /* make room for this one at the top */
  for (int j = IconvCacheUsed - 1; j >= 0; j--)
  {
    IconvCache[j + 1] = IconvCache[j];
  }

  IconvCacheUsed++;

  mutt_debug(LL_DEBUG2, "iconv: adding %s -> %s to the cache\n", fromcode1, tocode1);
  /* The cache keeps its own copies of the canonical names (pre iconv-hook) */
  IconvCache[0].fromcode1 = strdup(fromcode1);
  IconvCache[0].tocode1 = strdup(tocode1);
  IconvCache[0].cd = cd;

  return cd;
}

Here is the call graph for this function:

Here is the caller graph for this function:

◆ mutt_ch_lookup_add()

bool mutt_ch_lookup_add	(	enum LookupType	type,
		const char *	pat,
		const char *	replace,
		struct Buffer *	err
	)

Add a new character set lookup.

Parameters

type	Type of character set, e.g. MUTT_LOOKUP_CHARSET
pat	Pattern to match
replace	Replacement string
err	Buffer for error message

Return values

true	Lookup added to list
false	Regex string was invalid

Add a regex for a character set and a replacement name.

Definition at line 509 of file charset.c.

{
  if (!pat || !replace)
    return false;

  /* Compile the pattern case-insensitively */
  regex_t *rx = MUTT_MEM_CALLOC(1, regex_t);
  int rc = REG_COMP(rx, pat, REG_ICASE);
  if (rc != 0)
  {
    /* Report the regex error only if the caller supplied somewhere to put it;
     * a NULL err must not crash the failure path */
    if (err)
      regerror(rc, rx, err->data, err->dsize);
    FREE(&rx);
    return false;
  }

  /* The new lookup owns copies of both strings and takes over the compiled regex */
  struct Lookup *l = lookup_new();
  l->type = type;
  l->replacement = mutt_str_dup(replace);
  l->regex.pattern = mutt_str_dup(pat);
  l->regex.regex = rx;
  l->regex.pat_not = false;

  /* Lookups are kept in insertion order */
  TAILQ_INSERT_TAIL(&Lookups, l, entries);

  return true;
}

Here is the call graph for this function:

Here is the caller graph for this function:

◆ mutt_ch_lookup_remove()

void mutt_ch_lookup_remove ( void )

Remove all the character set lookups.

Empty the list of replacement character set names.

Definition at line 541 of file charset.c.

{
  struct Lookup *cur = NULL;
  struct Lookup *next = NULL;

  /* Unlink each entry from the global list, then free it */
  TAILQ_FOREACH_SAFE(cur, &Lookups, entries, next)
  {
    TAILQ_REMOVE(&Lookups, cur, entries);
    lookup_free(&cur);
  }
}

Here is the call graph for this function:

Here is the caller graph for this function:

◆ mutt_ch_set_charset()

void mutt_ch_set_charset ( const char * charset )

Update the records for a new character set.

Parameters

charset New character set

Check if this character set is utf-8 and pick a suitable replacement character for unprintable characters.

Note: This calls bind_textdomain_codeset() which will affect future message translations.

Definition at line 1075 of file charset.c.

{
  char canon[256] = { 0 };
  mutt_ch_canonical_charset(canon, sizeof(canon), charset);

  /* U+FFFD is the replacement for utf-8; any other charset gets '?' */
  const bool is_utf8 = mutt_ch_is_utf8(canon);
  CharsetIsUtf8 = is_utf8;
  if (is_utf8)
    ReplacementChar = 0xfffd; /* replacement character */
  else
    ReplacementChar = '?';

#if defined(HAVE_BIND_TEXTDOMAIN_CODESET) && defined(ENABLE_NLS)
  bind_textdomain_codeset(PACKAGE, canon);
#endif
}

Here is the call graph for this function:

Here is the caller graph for this function:

◆ mutt_ch_cache_cleanup()

void mutt_ch_cache_cleanup ( void )

Clean up the cached iconv handles and charset strings.

Definition at line 1175 of file charset.c.

{
  for (int i = 0; i < IconvCacheUsed; i++)
  {
    struct IconvCacheEntry *entry = &IconvCache[i];

    /* Free the cached charset names, then close the descriptor if it was ever valid */
    FREE(&entry->fromcode1);
    FREE(&entry->tocode1);
    if (iconv_t_valid(entry->cd))
      iconv_close(entry->cd);
  }

  /* Mark the cache as empty */
  IconvCacheUsed = 0;
}

Here is the call graph for this function:

Here is the caller graph for this function:

◆ iconv_t_valid()

static bool iconv_t_valid ( const iconv_t cd )

inline static

Is the conversion descriptor valid?

Parameters

cd	Conversion descriptor to test

Return values

true	It's valid

Definition at line 105 of file charset.h.

{
  /* ICONV_T_INVALID is the error value for iconv functions */
  if (cd == ICONV_T_INVALID)
    return false;

  return true;
}

Here is the caller graph for this function:

Variable Documentation

◆ CharsetIsUtf8

bool CharsetIsUtf8

extern

Is the user's current character set utf-8?

Definition at line 66 of file charset.c.

◆ ReplacementChar

wchar_t ReplacementChar

extern

When a Unicode character can't be displayed, use this instead.

Definition at line 61 of file charset.c.

Data Structures

Macros

Enumerations

Functions

Variables

Detailed Description

Macro Definition Documentation

◆ MUTT_ICONV_NO_FLAGS

◆ MUTT_ICONV_HOOK_FROM

◆ mutt_ch_is_utf8

◆ mutt_ch_is_us_ascii

◆ ICONV_T_INVALID

◆ ICONV_ILLEGAL_SEQ

◆ ICONV_BUF_TOO_SMALL

Enumeration Type Documentation

◆ LookupType

Function Documentation

◆ mutt_ch_canonical_charset()

◆ mutt_ch_charset_lookup()

◆ mutt_ch_check()

◆ mutt_ch_check_charset()

◆ mutt_ch_choose()

◆ mutt_ch_chscmp()

◆ mutt_ch_convert_nonmime_string()

◆ mutt_ch_convert_string()

◆ mutt_ch_fgetconv()

◆ mutt_ch_fgetconv_close()

◆ mutt_ch_fgetconv_open()

◆ mutt_ch_fgetconvs()

◆ mutt_ch_get_default_charset()

◆ mutt_ch_get_langinfo_charset()

◆ mutt_ch_iconv()

◆ mutt_ch_iconv_lookup()

◆ mutt_ch_iconv_open()

◆ mutt_ch_lookup_add()

◆ mutt_ch_lookup_remove()

◆ mutt_ch_set_charset()

◆ mutt_ch_cache_cleanup()

◆ iconv_t_valid()

Variable Documentation

◆ CharsetIsUtf8

◆ ReplacementChar