NeoMutt  2020-06-26-89-g172cd3
Teaching an old dog new tricks
DOXYGEN
charset.c File Reference

Conversion between different character encodings. More...

#include "config.h"
#include <ctype.h>
#include <errno.h>
#include <iconv.h>
#include <langinfo.h>
#include <limits.h>
#include <stdbool.h>
#include <stdio.h>
#include <string.h>
#include "charset.h"
#include "buffer.h"
#include "memory.h"
#include "queue.h"
#include "regex3.h"
#include "string2.h"
#include <libintl.h>
+ Include dependency graph for charset.c:

Go to the source code of this file.

Data Structures

struct  Lookup
 Regex to String lookup table. More...
 
struct  MimeNames
 MIME name lookup entry. More...
 

Macros

#define EILSEQ   EINVAL
 

Functions

 TAILQ_HEAD (LookupList, Lookup)
 
static struct Lookup * lookup_new (void)
 Create a new Lookup. More...
 
static void lookup_free (struct Lookup **ptr)
 Free a Lookup. More...
 
static const char * lookup_charset (enum LookupType type, const char *cs)
 Look for a preferred character set name. More...
 
int mutt_ch_convert_nonmime_string (char **ps)
 Try to convert a string using a list of character sets. More...
 
void mutt_ch_canonical_charset (char *buf, size_t buflen, const char *name)
 Canonicalise the charset of a string. More...
 
bool mutt_ch_chscmp (const char *cs1, const char *cs2)
 Are the names of two character sets equivalent? More...
 
char * mutt_ch_get_default_charset (void)
 Get the default character set. More...
 
char * mutt_ch_get_langinfo_charset (void)
 Get the user's choice of character set. More...
 
bool mutt_ch_lookup_add (enum LookupType type, const char *pat, const char *replace, struct Buffer *err)
 Add a new character set lookup. More...
 
void mutt_ch_lookup_remove (void)
 Remove all the character set lookups. More...
 
const char * mutt_ch_charset_lookup (const char *chs)
 Look for a replacement character set. More...
 
iconv_t mutt_ch_iconv_open (const char *tocode, const char *fromcode, int flags)
 Set up iconv for conversions. More...
 
size_t mutt_ch_iconv (iconv_t cd, const char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft, const char **inrepls, const char *outrepl, int *iconverrno)
 Change the encoding of a string. More...
 
const char * mutt_ch_iconv_lookup (const char *chs)
 Look for a replacement character set. More...
 
int mutt_ch_check (const char *s, size_t slen, const char *from, const char *to)
 Check whether a string can be converted between encodings. More...
 
int mutt_ch_convert_string (char **ps, const char *from, const char *to, int flags)
 Convert a string between encodings. More...
 
bool mutt_ch_check_charset (const char *cs, bool strict)
 Does iconv understand a character set? More...
 
struct FgetConv * mutt_ch_fgetconv_open (FILE *fp, const char *from, const char *to, int flags)
 Prepare a file for charset conversion. More...
 
void mutt_ch_fgetconv_close (struct FgetConv **fc)
 Close an fgetconv handle. More...
 
int mutt_ch_fgetconv (struct FgetConv *fc)
 Convert a file's character set. More...
 
char * mutt_ch_fgetconvs (char *buf, size_t buflen, struct FgetConv *fc)
 Convert a file's charset into a string buffer. More...
 
void mutt_ch_set_charset (const char *charset)
 Update the records for a new character set. More...
 
char * mutt_ch_choose (const char *fromcode, const char *charsets, const char *u, size_t ulen, char **d, size_t *dlen)
 Figure the best charset to encode a string. More...
 

Variables

char * C_AssumedCharset
 Config: If a message is missing a character set, assume this character set. More...
 
char * C_Charset
 Config: Default character set for displaying text on screen. More...
 
wchar_t ReplacementChar = '?'
 When a Unicode character can't be displayed, use this instead. More...
 
bool CharsetIsUtf8 = false
 Is the user's current character set utf-8? More...
 
static struct LookupList Lookups = TAILQ_HEAD_INITIALIZER(Lookups)
 
const struct MimeNames PreferredMimeNames []
 Lookup table of preferred charsets. More...
 

Detailed Description

Conversion between different character encodings.

Authors
  • Thomas Roessler

This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with this program. If not, see http://www.gnu.org/licenses/.

Definition in file charset.c.

Macro Definition Documentation

◆ EILSEQ

#define EILSEQ   EINVAL

Definition at line 49 of file charset.c.

Function Documentation

◆ TAILQ_HEAD()

TAILQ_HEAD ( LookupList  ,
Lookup   
)

◆ lookup_new()

static struct Lookup* lookup_new ( void  )
static

Create a new Lookup.

Return values
ptr: New Lookup

Definition at line 249 of file charset.c.

250 {
251  return mutt_mem_calloc(1, sizeof(struct Lookup));
252 }
void * mutt_mem_calloc(size_t nmemb, size_t size)
Allocate zeroed memory on the heap.
Definition: memory.c:50
Regex to String lookup table.
Definition: charset.c:70
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ lookup_free()

static void lookup_free ( struct Lookup **  ptr)
static

Free a Lookup.

Parameters
ptr: Lookup to free

Definition at line 258 of file charset.c.

259 {
260  if (!ptr || !*ptr)
261  return;
262 
263  struct Lookup *l = *ptr;
264  FREE(&l->replacement);
265  FREE(&l->regex.pattern);
266  if (l->regex.regex)
267  regfree(l->regex.regex);
268  FREE(&l->regex.regex);
269  FREE(&l->regex);
270 
271  FREE(ptr);
272 }
regex_t * regex
compiled expression
Definition: regex3.h:91
char * replacement
Alternative charset to use.
Definition: charset.c:74
struct Regex regex
Regular expression.
Definition: charset.c:73
Regex to String lookup table.
Definition: charset.c:70
#define FREE(x)
Definition: memory.h:40
char * pattern
printable version
Definition: regex3.h:90
+ Here is the caller graph for this function:

◆ lookup_charset()

static const char* lookup_charset ( enum LookupType  type,
const char *  cs 
)
static

Look for a preferred character set name.

Parameters
type: Type, e.g. MUTT_LOOKUP_CHARSET
cs: Character set
Return values
ptr: Charset string

If the character set matches one of the regexes, then return the replacement name.

Definition at line 283 of file charset.c.

284 {
285  if (!cs)
286  return NULL;
287 
288  struct Lookup *l = NULL;
289 
290  TAILQ_FOREACH(l, &Lookups, entries)
291  {
292  if (l->type != type)
293  continue;
294  if (mutt_regex_match(&l->regex, cs))
295  return l->replacement;
296  }
297  return NULL;
298 }
#define TAILQ_FOREACH(var, head, field)
Definition: queue.h:718
char * replacement
Alternative charset to use.
Definition: charset.c:74
struct Regex regex
Regular expression.
Definition: charset.c:73
Regex to String lookup table.
Definition: charset.c:70
static struct LookupList Lookups
Definition: charset.c:79
enum LookupType type
Lookup type.
Definition: charset.c:72
bool mutt_regex_match(const struct Regex *regex, const char *str)
Shorthand to mutt_regex_capture()
Definition: regex.c:609
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_convert_nonmime_string()

int mutt_ch_convert_nonmime_string ( char **  ps)

Try to convert a string using a list of character sets.

Parameters
[in,out] ps: String to be converted
Return values
0: Success
-1: Error

Work through $assumed_charset looking for a character set conversion that works. Failing that, try mutt_ch_get_default_charset().

Definition at line 309 of file charset.c.

310 {
311  if (!ps)
312  return -1;
313 
314  char *u = *ps;
315  const size_t ulen = mutt_str_len(u);
316  if (ulen == 0)
317  return 0;
318 
319  const char *c1 = NULL;
320 
321  for (const char *c = C_AssumedCharset; c; c = c1 ? c1 + 1 : 0)
322  {
323  c1 = strchr(c, ':');
324  size_t n = c1 ? c1 - c : mutt_str_len(c);
325  if (n == 0)
326  return 0;
327  char *fromcode = mutt_mem_malloc(n + 1);
328  mutt_str_copy(fromcode, c, n + 1);
329  char *s = mutt_strn_dup(u, ulen);
330  int m = mutt_ch_convert_string(&s, fromcode, C_Charset, 0);
331  FREE(&fromcode);
332  FREE(&s);
333  if (m == 0)
334  {
335  return 0;
336  }
337  }
340  return -1;
341 }
char * C_AssumedCharset
Config: If a message is missing a character set, assume this character set.
Definition: charset.c:52
int mutt_ch_convert_string(char **ps, const char *from, const char *to, int flags)
Convert a string between encodings.
Definition: charset.c:754
char * mutt_strn_dup(const char *begin, size_t len)
Duplicate a sub-string.
Definition: string.c:553
void * mutt_mem_malloc(size_t size)
Allocate memory on the heap.
Definition: memory.c:90
size_t mutt_str_len(const char *a)
Calculate the length of a string, safely.
Definition: string.c:636
int n
Definition: acutest.h:492
size_t mutt_str_copy(char *dest, const char *src, size_t dsize)
Copy a string into a buffer (guaranteeing NUL-termination)
Definition: string.c:721
#define FREE(x)
Definition: memory.h:40
char * mutt_ch_get_default_charset(void)
Get the default character set.
Definition: charset.c:440
char * C_Charset
Config: Default character set for displaying text on screen.
Definition: charset.c:53
#define MUTT_ICONV_HOOK_FROM
apply charset-hooks to fromcode
Definition: charset.h:72
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_canonical_charset()

void mutt_ch_canonical_charset ( char *  buf,
size_t  buflen,
const char *  name 
)

Canonicalise the charset of a string.

Parameters
buf: Buffer for canonical character set name
buflen: Length of buffer
name: Name to be canonicalised

This first ties off any charset extension such as "//TRANSLIT", canonicalizes the charset and re-adds the extension.

Definition at line 352 of file charset.c.

353 {
354  if (!buf || !name)
355  return;
356 
357  char in[1024], scratch[1024];
358 
359  mutt_str_copy(in, name, sizeof(in));
360  char *ext = strchr(in, '/');
361  if (ext)
362  *ext++ = '\0';
363 
364  if (mutt_istr_equal(in, "utf-8") || mutt_istr_equal(in, "utf8"))
365  {
366  mutt_str_copy(buf, "utf-8", buflen);
367  goto out;
368  }
369 
370  /* catch some common iso-8859-something misspellings */
371  size_t plen;
372  if ((plen = mutt_istr_startswith(in, "8859")) && (in[plen] != '-'))
373  snprintf(scratch, sizeof(scratch), "iso-8859-%s", in + plen);
374  else if ((plen = mutt_istr_startswith(in, "8859-")))
375  snprintf(scratch, sizeof(scratch), "iso-8859-%s", in + plen);
376  else if ((plen = mutt_istr_startswith(in, "iso8859")) && (in[plen] != '-'))
377  snprintf(scratch, sizeof(scratch), "iso_8859-%s", in + plen);
378  else if ((plen = mutt_istr_startswith(in, "iso8859-")))
379  snprintf(scratch, sizeof(scratch), "iso_8859-%s", in + plen);
380  else
381  mutt_str_copy(scratch, in, sizeof(scratch));
382 
383  for (size_t i = 0; PreferredMimeNames[i].key; i++)
384  {
385  if (mutt_istr_equal(scratch, PreferredMimeNames[i].key))
386  {
387  mutt_str_copy(buf, PreferredMimeNames[i].pref, buflen);
388  goto out;
389  }
390  }
391 
392  mutt_str_copy(buf, scratch, buflen);
393 
394  /* for cosmetics' sake, transform to lowercase. */
395  for (char *p = buf; *p; p++)
396  *p = tolower(*p);
397 
398 out:
399  if (ext && *ext)
400  {
401  mutt_str_cat(buf, buflen, "/");
402  mutt_str_cat(buf, buflen, ext);
403  }
404 }
const char * key
Definition: charset.c:86
static size_t plen
Length of cached packet.
Definition: pgppacket.c:39
bool mutt_istr_equal(const char *a, const char *b)
Compare two strings, ignoring case.
Definition: string.c:888
const struct MimeNames PreferredMimeNames[]
Lookup table of preferred charsets.
Definition: charset.c:101
size_t mutt_istr_startswith(const char *str, const char *prefix)
Check whether a string starts with a prefix, ignoring case.
Definition: string.c:177
char * mutt_str_cat(char *buf, size_t buflen, const char *s)
Concatenate two strings.
Definition: string.c:390
size_t mutt_str_copy(char *dest, const char *src, size_t dsize)
Copy a string into a buffer (guaranteeing NUL-termination)
Definition: string.c:721
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_chscmp()

bool mutt_ch_chscmp ( const char *  cs1,
const char *  cs2 
)

Are the names of two character sets equivalent?

Parameters
cs1: First character set
cs2: Second character set
Return values
true: Names are equivalent
false: Names differ

Charsets may have extensions that mutt_ch_canonical_charset() leaves intact. We expect 'cs2' to originate from neomutt code, not user input (i.e. 'cs2' does not have any extension), so we simply check whether the shorter string is a prefix of the longer.

Definition at line 418 of file charset.c.

419 {
420  if (!cs1 || !cs2)
421  return false;
422 
423  char buf[256];
424 
425  mutt_ch_canonical_charset(buf, sizeof(buf), cs1);
426 
427  int len1 = mutt_str_len(buf);
428  int len2 = mutt_str_len(cs2);
429 
430  return mutt_istrn_equal(((len1 > len2) ? buf : cs2),
431  ((len1 > len2) ? cs2 : buf), MIN(len1, len2));
432 }
#define MIN(a, b)
Definition: memory.h:31
bool mutt_istrn_equal(const char *a, const char *b, size_t l)
Check for equality of two strings ignoring case (to a maximum), safely.
Definition: string.c:626
void mutt_ch_canonical_charset(char *buf, size_t buflen, const char *name)
Canonicalise the charset of a string.
Definition: charset.c:352
size_t mutt_str_len(const char *a)
Calculate the length of a string, safely.
Definition: string.c:636
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_get_default_charset()

char* mutt_ch_get_default_charset ( void  )

Get the default character set.

Return values
ptr: Name of the default character set
Warning
This returns a pointer to a static buffer. Do not free it.

Definition at line 440 of file charset.c.

441 {
442  static char fcharset[128];
443  const char *c = C_AssumedCharset;
444  const char *c1 = NULL;
445 
446  if (c)
447  {
448  c1 = strchr(c, ':');
449  mutt_str_copy(fcharset, c, c1 ? (c1 - c + 1) : sizeof(fcharset));
450  return fcharset;
451  }
452  return strcpy(fcharset, "us-ascii");
453 }
char * C_AssumedCharset
Config: If a message is missing a character set, assume this character set.
Definition: charset.c:52
size_t mutt_str_copy(char *dest, const char *src, size_t dsize)
Copy a string into a buffer (guaranteeing NUL-termination)
Definition: string.c:721
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_get_langinfo_charset()

char* mutt_ch_get_langinfo_charset ( void  )

Get the user's choice of character set.

Return values
ptr: Charset string

Get the canonical character set used by the user's locale. The caller must free the returned string.

Definition at line 462 of file charset.c.

463 {
464  char buf[1024] = { 0 };
465 
466  mutt_ch_canonical_charset(buf, sizeof(buf), nl_langinfo(CODESET));
467 
468  if (buf[0] != '\0')
469  return mutt_str_dup(buf);
470 
471  return mutt_str_dup("iso-8859-1");
472 }
char * mutt_str_dup(const char *str)
Copy a string, safely.
Definition: string.c:375
void mutt_ch_canonical_charset(char *buf, size_t buflen, const char *name)
Canonicalise the charset of a string.
Definition: charset.c:352
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_lookup_add()

bool mutt_ch_lookup_add ( enum LookupType  type,
const char *  pat,
const char *  replace,
struct Buffer * err 
)

Add a new character set lookup.

Parameters
type: Type of character set, e.g. MUTT_LOOKUP_CHARSET
pat: Pattern to match
replace: Replacement string
err: Buffer for error message
Return values
true: Lookup added to list
false: Regex string was invalid

Add a regex for a character set and a replacement name.

Definition at line 485 of file charset.c.

487 {
488  if (!pat || !replace)
489  return false;
490 
491  regex_t *rx = mutt_mem_malloc(sizeof(regex_t));
492  int rc = REG_COMP(rx, pat, REG_ICASE);
493  if (rc != 0)
494  {
495  regerror(rc, rx, err->data, err->dsize);
496  FREE(&rx);
497  return false;
498  }
499 
500  struct Lookup *l = lookup_new();
501  l->type = type;
502  l->replacement = mutt_str_dup(replace);
503  l->regex.pattern = mutt_str_dup(pat);
504  l->regex.regex = rx;
505  l->regex.pat_not = false;
506 
507  TAILQ_INSERT_TAIL(&Lookups, l, entries);
508 
509  return true;
510 }
static struct Lookup * lookup_new(void)
Create a new Lookup.
Definition: charset.c:249
regex_t * regex
compiled expression
Definition: regex3.h:91
bool pat_not
do not match
Definition: regex3.h:92
char * mutt_str_dup(const char *str)
Copy a string, safely.
Definition: string.c:375
char * replacement
Alternative charset to use.
Definition: charset.c:74
struct Regex regex
Regular expression.
Definition: charset.c:73
Regex to String lookup table.
Definition: charset.c:70
size_t dsize
Length of data.
Definition: buffer.h:37
#define REG_COMP(preg, regex, cflags)
Compile a regular expression.
Definition: regex3.h:53
void * mutt_mem_malloc(size_t size)
Allocate memory on the heap.
Definition: memory.c:90
char * data
Pointer to data.
Definition: buffer.h:35
static struct LookupList Lookups
Definition: charset.c:79
#define TAILQ_INSERT_TAIL(head, elm, field)
Definition: queue.h:802
enum LookupType type
Lookup type.
Definition: charset.c:72
#define FREE(x)
Definition: memory.h:40
char * pattern
printable version
Definition: regex3.h:90
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_lookup_remove()

void mutt_ch_lookup_remove ( void  )

Remove all the character set lookups.

Empty the list of replacement character set names.

Definition at line 517 of file charset.c.

518 {
519  struct Lookup *l = NULL;
520  struct Lookup *tmp = NULL;
521 
522  TAILQ_FOREACH_SAFE(l, &Lookups, entries, tmp)
523  {
524  TAILQ_REMOVE(&Lookups, l, entries);
525  lookup_free(&l);
526  }
527 }
#define TAILQ_FOREACH_SAFE(var, head, field, tvar)
Definition: queue.h:728
Regex to String lookup table.
Definition: charset.c:70
static void lookup_free(struct Lookup **ptr)
Free a Lookup.
Definition: charset.c:258
#define TAILQ_REMOVE(head, elm, field)
Definition: queue.h:834
static struct LookupList Lookups
Definition: charset.c:79
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_charset_lookup()

const char* mutt_ch_charset_lookup ( const char *  chs)

Look for a replacement character set.

Parameters
chs: Character set to lookup
Return values
ptr: Replacement character set (if a 'charset-hook' matches)
NULL: No matching hook

Look through all the 'charset-hook's. If one matches return the replacement character set.

Definition at line 538 of file charset.c.

539 {
541 }
static const char * lookup_charset(enum LookupType type, const char *cs)
Look for a preferred character set name.
Definition: charset.c:283
static char * chs
Definition: gnupgparse.c:73
Alias for another character set.
Definition: charset.h:68
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_iconv_open()

iconv_t mutt_ch_iconv_open ( const char *  tocode,
const char *  fromcode,
int  flags 
)

Set up iconv for conversions.

Parameters
tocode: Target character set
fromcode: Current character set
flags: Flags, e.g. MUTT_ICONV_HOOK_FROM
Return values
ptr: iconv handle for the conversion

Like iconv_open, but canonicalises the charsets, applies charset-hooks, recanonicalises, and finally applies iconv-hooks. Parameter flags=0 skips charset-hooks, while MUTT_ICONV_HOOK_FROM applies them to fromcode. Callers should use flags=0 when fromcode can safely be considered true, either some constant, or some value provided by the user; MUTT_ICONV_HOOK_FROM should be used only when fromcode is unsure, taken from a possibly wrong incoming MIME label, or such. Misusing MUTT_ICONV_HOOK_FROM leads to unwanted interactions in some setups.

Note
By design charset-hooks should never be, and are never, applied to tocode.
The aptly named MUTT_ICONV_HOOK_FROM acts on charset-hooks, not at all on iconv-hooks.

Definition at line 565 of file charset.c.

566 {
567  char tocode1[128];
568  char fromcode1[128];
569  const char *tocode2 = NULL, *fromcode2 = NULL;
570  const char *tmp = NULL;
571 
572  iconv_t cd;
573 
574  /* transform to MIME preferred charset names */
575  mutt_ch_canonical_charset(tocode1, sizeof(tocode1), tocode);
576  mutt_ch_canonical_charset(fromcode1, sizeof(fromcode1), fromcode);
577 
578  /* maybe apply charset-hooks and recanonicalise fromcode,
579  * but only when caller asked us to sanitize a potentially wrong
580  * charset name incoming from the wild exterior. */
581  if (flags & MUTT_ICONV_HOOK_FROM)
582  {
583  tmp = mutt_ch_charset_lookup(fromcode1);
584  if (tmp)
585  mutt_ch_canonical_charset(fromcode1, sizeof(fromcode1), tmp);
586  }
587 
588  /* always apply iconv-hooks to suit system's iconv tastes */
589  tocode2 = mutt_ch_iconv_lookup(tocode1);
590  tocode2 = tocode2 ? tocode2 : tocode1;
591  fromcode2 = mutt_ch_iconv_lookup(fromcode1);
592  fromcode2 = fromcode2 ? fromcode2 : fromcode1;
593 
594  /* call system iconv with names it appreciates */
595  cd = iconv_open(tocode2, fromcode2);
596  if (cd != (iconv_t) -1)
597  return cd;
598 
599  return (iconv_t) -1;
600 }
const char * mutt_ch_iconv_lookup(const char *chs)
Look for a replacement character set.
Definition: charset.c:702
void mutt_ch_canonical_charset(char *buf, size_t buflen, const char *name)
Canonicalise the charset of a string.
Definition: charset.c:352
const char * mutt_ch_charset_lookup(const char *chs)
Look for a replacement character set.
Definition: charset.c:538
#define MUTT_ICONV_HOOK_FROM
apply charset-hooks to fromcode
Definition: charset.h:72
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_iconv()

size_t mutt_ch_iconv ( iconv_t  cd,
const char **  inbuf,
size_t *  inbytesleft,
char **  outbuf,
size_t *  outbytesleft,
const char **  inrepls,
const char *  outrepl,
int *  iconverrno 
)

Change the encoding of a string.

Parameters
[in] cd: Iconv conversion descriptor
[in,out] inbuf: Buffer to convert
[in,out] inbytesleft: Length of buffer to convert
[in,out] outbuf: Buffer for the result
[in,out] outbytesleft: Length of result buffer
[in] inrepls: Input replacement characters
[in] outrepl: Output replacement characters
[out] iconverrno: Errno if iconv() fails, 0 if it succeeds
Return values
num: Characters converted

Like iconv, but keeps going even when the input is invalid. If you're supplying inrepls, the source charset should be stateless; if you're supplying an outrepl, the target charset should be.

Definition at line 618 of file charset.c.

621 {
622  size_t rc = 0;
623  const char *ib = *inbuf;
624  size_t ibl = *inbytesleft;
625  char *ob = *outbuf;
626  size_t obl = *outbytesleft;
627 
628  while (true)
629  {
630  errno = 0;
631  const size_t ret1 = iconv(cd, (ICONV_CONST char **) &ib, &ibl, &ob, &obl);
632  if (ret1 != (size_t) -1)
633  rc += ret1;
634  if (iconverrno)
635  *iconverrno = errno;
636 
637  if (ibl && obl && (errno == EILSEQ))
638  {
639  if (inrepls)
640  {
641  /* Try replacing the input */
642  const char **t = NULL;
643  for (t = inrepls; *t; t++)
644  {
645  const char *ib1 = *t;
646  size_t ibl1 = strlen(*t);
647  char *ob1 = ob;
648  size_t obl1 = obl;
649  iconv(cd, (ICONV_CONST char **) &ib1, &ibl1, &ob1, &obl1);
650  if (ibl1 == 0)
651  {
652  ib++;
653  ibl--;
654  ob = ob1;
655  obl = obl1;
656  rc++;
657  break;
658  }
659  }
660  if (*t)
661  continue;
662  }
663  /* Replace the output */
664  if (!outrepl)
665  outrepl = "?";
666  iconv(cd, NULL, NULL, &ob, &obl);
667  if (obl)
668  {
669  int n = strlen(outrepl);
670  if (n > obl)
671  {
672  outrepl = "?";
673  n = 1;
674  }
675  memcpy(ob, outrepl, n);
676  ib++;
677  ibl--;
678  ob += n;
679  obl -= n;
680  rc++;
681  iconv(cd, NULL, NULL, NULL, NULL); /* for good measure */
682  continue;
683  }
684  }
685  *inbuf = ib;
686  *inbytesleft = ibl;
687  *outbuf = ob;
688  *outbytesleft = obl;
689  return rc;
690  }
691 }
int n
Definition: acutest.h:492
#define EILSEQ
Definition: charset.c:49
+ Here is the caller graph for this function:

◆ mutt_ch_iconv_lookup()

const char* mutt_ch_iconv_lookup ( const char *  chs)

Look for a replacement character set.

Parameters
chsCharacter set to lookup
Return values
ptrReplacement character set (if a 'iconv-hook' matches)
NULLNo matching hook

Look through all the 'iconv-hook's. If one matches return the replacement character set.

Definition at line 702 of file charset.c.

703 {
705 }
Character set conversion.
Definition: charset.h:69
static const char * lookup_charset(enum LookupType type, const char *cs)
Look for a preferred character set name.
Definition: charset.c:283
static char * chs
Definition: gnupgparse.c:73
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_check()

int mutt_ch_check ( const char *  s,
size_t  slen,
const char *  from,
const char *  to 
)

Check whether a string can be converted between encodings.

Parameters
[in] s: String to check
[in] slen: Length of the string to check
[in] from: Current character set
[in] to: Target character set
Return values
0: Success
-1: Error in iconv_open()
>0: Errno as set by iconv()

Definition at line 717 of file charset.c.

718 {
719  if (!s || !from || !to)
720  return -1;
721 
722  int rc = 0;
723  iconv_t cd = mutt_ch_iconv_open(to, from, 0);
724  if (cd == (iconv_t) -1)
725  return -1;
726 
727  size_t outlen = MB_LEN_MAX * slen;
728  char *out = mutt_mem_malloc(outlen + 1);
729  char *saved_out = out;
730 
731  const size_t convlen =
732  iconv(cd, (ICONV_CONST char **) &s, &slen, &out, (size_t *) &outlen);
733  if (convlen == -1)
734  rc = errno;
735 
736  FREE(&saved_out);
737  iconv_close(cd);
738  return rc;
739 }
void * mutt_mem_malloc(size_t size)
Allocate memory on the heap.
Definition: memory.c:90
iconv_t mutt_ch_iconv_open(const char *tocode, const char *fromcode, int flags)
Set up iconv for conversions.
Definition: charset.c:565
#define FREE(x)
Definition: memory.h:40
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_convert_string()

int mutt_ch_convert_string ( char **  ps,
const char *  from,
const char *  to,
int  flags 
)

Convert a string between encodings.

Parameters
[in,out] ps: String to convert
[in] from: Current character set
[in] to: Target character set
[in] flags: Flags, e.g. MUTT_ICONV_HOOK_FROM
Return values
0: Success
-1: Invalid arguments or failure to open an iconv channel
errno: Failure in iconv conversion

Parameter flags is given as-is to mutt_ch_iconv_open(). See there for its meaning and usage policy.

Definition at line 754 of file charset.c.

755 {
756  if (!ps)
757  return -1;
758 
759  char *s = *ps;
760 
761  if (!s || (*s == '\0'))
762  return 0;
763 
764  if (!to || !from)
765  return -1;
766 
767  const char *repls[] = { "\357\277\275", "?", 0 };
768  int rc = 0;
769 
770  iconv_t cd = mutt_ch_iconv_open(to, from, flags);
771  if (cd == (iconv_t) -1)
772  return -1;
773 
774  size_t len;
775  const char *ib = NULL;
776  char *buf = NULL, *ob = NULL;
777  size_t ibl, obl;
778  const char **inrepls = NULL;
779  const char *outrepl = NULL;
780 
781  if (mutt_ch_is_utf8(to))
782  outrepl = "\357\277\275";
783  else if (mutt_ch_is_utf8(from))
784  inrepls = repls;
785  else
786  outrepl = "?";
787 
788  len = strlen(s);
789  ib = s;
790  ibl = len + 1;
791  obl = MB_LEN_MAX * ibl;
792  buf = mutt_mem_malloc(obl + 1);
793  ob = buf;
794 
795  mutt_ch_iconv(cd, &ib, &ibl, &ob, &obl, inrepls, outrepl, &rc);
796  iconv_close(cd);
797 
798  *ob = '\0';
799 
800  FREE(ps);
801  *ps = buf;
802 
803  mutt_str_adjust(ps);
804  return rc;
805 }
#define mutt_ch_is_utf8(str)
Definition: charset.h:95
void mutt_str_adjust(char **p)
Shrink-to-fit a string.
Definition: string.c:495
size_t mutt_ch_iconv(iconv_t cd, const char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft, const char **inrepls, const char *outrepl, int *iconverrno)
Change the encoding of a string.
Definition: charset.c:618
void * mutt_mem_malloc(size_t size)
Allocate memory on the heap.
Definition: memory.c:90
iconv_t mutt_ch_iconv_open(const char *tocode, const char *fromcode, int flags)
Set up iconv for conversions.
Definition: charset.c:565
#define FREE(x)
Definition: memory.h:40
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_check_charset()

bool mutt_ch_check_charset ( const char *  cs,
bool  strict 
)

Does iconv understand a character set?

Parameters
cs: Character set to check
strict: Check strictly by using iconv
Return values
true: Character set is valid

If strict is false, then finding a matching character set in PreferredMimeNames will be enough. If strict is true, or the charset is not in PreferredMimeNames, then iconv() will be run.

Definition at line 818 of file charset.c.

819 {
820  if (!cs)
821  return false;
822 
823  if (mutt_ch_is_utf8(cs))
824  return true;
825 
826  if (!strict)
827  {
828  for (int i = 0; PreferredMimeNames[i].key; i++)
829  {
830  if (mutt_istr_equal(PreferredMimeNames[i].key, cs) ||
831  mutt_istr_equal(PreferredMimeNames[i].pref, cs))
832  {
833  return true;
834  }
835  }
836  }
837 
838  iconv_t cd = mutt_ch_iconv_open(cs, cs, 0);
839  if (cd != (iconv_t)(-1))
840  {
841  iconv_close(cd);
842  return true;
843  }
844 
845  return false;
846 }
#define mutt_ch_is_utf8(str)
Definition: charset.h:95
const char * key
Definition: charset.c:86
bool mutt_istr_equal(const char *a, const char *b)
Compare two strings, ignoring case.
Definition: string.c:888
const struct MimeNames PreferredMimeNames[]
Lookup table of preferred charsets.
Definition: charset.c:101
iconv_t mutt_ch_iconv_open(const char *tocode, const char *fromcode, int flags)
Set up iconv for conversions.
Definition: charset.c:565
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_fgetconv_open()

struct FgetConv* mutt_ch_fgetconv_open ( FILE *  fp,
const char *  from,
const char *  to,
int  flags 
)

Prepare a file for charset conversion.

Parameters
fp: FILE ptr to prepare
from: Current character set
to: Destination character set
flags: Flags, e.g. MUTT_ICONV_HOOK_FROM
Return values
ptr: fgetconv handle

Parameter flags is given as-is to mutt_ch_iconv_open().

Definition at line 858 of file charset.c.

859 {
860  struct FgetConv *fc = NULL;
861  iconv_t cd = (iconv_t) -1;
862 
863  if (from && to)
864  cd = mutt_ch_iconv_open(to, from, flags);
865 
866  if (cd != (iconv_t) -1)
867  {
868  static const char *repls[] = { "\357\277\275", "?", 0 };
869 
870  fc = mutt_mem_malloc(sizeof(struct FgetConv));
871  fc->p = fc->bufo;
872  fc->ob = fc->bufo;
873  fc->ib = fc->bufi;
874  fc->ibl = 0;
875  fc->inrepls = mutt_ch_is_utf8(to) ? repls : repls + 1;
876  }
877  else
878  fc = mutt_mem_malloc(sizeof(struct FgetConvNot));
879  fc->fp = fp;
880  fc->cd = cd;
881  return fc;
882 }
#define mutt_ch_is_utf8(str)
Definition: charset.h:95
char bufi[512]
Definition: charset.h:45
size_t ibl
Definition: charset.h:50
A dummy converter.
Definition: charset.h:57
FILE * fp
Definition: charset.h:43
iconv_t cd
Definition: charset.h:44
void * mutt_mem_malloc(size_t size)
Allocate memory on the heap.
Definition: memory.c:90
iconv_t mutt_ch_iconv_open(const char *tocode, const char *fromcode, int flags)
Set up iconv for conversions.
Definition: charset.c:565
char * ib
Definition: charset.h:49
char * p
Definition: charset.h:47
char bufo[512]
Definition: charset.h:46
Cursor for converting a file's encoding.
Definition: charset.h:41
char * ob
Definition: charset.h:48
const char ** inrepls
Definition: charset.h:51
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_fgetconv_close()

void mutt_ch_fgetconv_close ( struct FgetConv **  fc)

Close an fgetconv handle.

Parameters
[out] fc: fgetconv handle

Definition at line 888 of file charset.c.

889 {
890  if (!fc || !*fc)
891  return;
892 
893  if ((*fc)->cd != (iconv_t) -1)
894  iconv_close((*fc)->cd);
895  FREE(fc);
896 }
#define FREE(x)
Definition: memory.h:40
+ Here is the caller graph for this function:

◆ mutt_ch_fgetconv()

int mutt_ch_fgetconv ( struct FgetConv *  fc)

Convert a file's character set.

Parameters
fc: FgetConv handle
Return values
num: Next character in the converted file
EOF: Error, or no more converted data is available

A file is read into a buffer and its character set is converted. Each call to this function will return one converted character. The buffer is refilled automatically when empty.

Definition at line 908 of file charset.c.

909 {
910  if (!fc)
911  return EOF;
912  if (fc->cd == (iconv_t) -1)
913  return fgetc(fc->fp);
914  if (!fc->p)
915  return EOF;
916  if (fc->p < fc->ob)
917  return (unsigned char) *(fc->p)++;
918 
919  /* Try to convert some more */
920  fc->p = fc->bufo;
921  fc->ob = fc->bufo;
922  if (fc->ibl)
923  {
924  size_t obl = sizeof(fc->bufo);
925  iconv(fc->cd, (ICONV_CONST char **) &fc->ib, &fc->ibl, &fc->ob, &obl);
926  if (fc->p < fc->ob)
927  return (unsigned char) *(fc->p)++;
928  }
929 
930  /* If we trusted iconv a bit more, we would at this point
931  * ask why it had stopped converting ... */
932 
933  /* Try to read some more */
934  if ((fc->ibl == sizeof(fc->bufi)) ||
935  (fc->ibl && (fc->ib + fc->ibl < fc->bufi + sizeof(fc->bufi))))
936  {
937  fc->p = 0;
938  return EOF;
939  }
940  if (fc->ibl)
941  memcpy(fc->bufi, fc->ib, fc->ibl);
942  fc->ib = fc->bufi;
943  fc->ibl += fread(fc->ib + fc->ibl, 1, sizeof(fc->bufi) - fc->ibl, fc->fp);
944 
945  /* Try harder this time to convert some */
946  if (fc->ibl)
947  {
948  size_t obl = sizeof(fc->bufo);
949  mutt_ch_iconv(fc->cd, (const char **) &fc->ib, &fc->ibl, &fc->ob, &obl,
950  fc->inrepls, 0, NULL);
951  if (fc->p < fc->ob)
952  return (unsigned char) *(fc->p)++;
953  }
954 
955  /* Either the file has finished or one of the buffers is too small */
956  fc->p = 0;
957  return EOF;
958 }
char bufi[512]
Definition: charset.h:45
size_t ibl
Definition: charset.h:50
size_t mutt_ch_iconv(iconv_t cd, const char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft, const char **inrepls, const char *outrepl, int *iconverrno)
Change the encoding of a string.
Definition: charset.c:618
FILE * fp
Definition: charset.h:43
iconv_t cd
Definition: charset.h:44
char * ib
Definition: charset.h:49
char * p
Definition: charset.h:47
char bufo[512]
Definition: charset.h:46
char * ob
Definition: charset.h:48
const char ** inrepls
Definition: charset.h:51
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_fgetconvs()

char* mutt_ch_fgetconvs ( char *  buf,
size_t  buflen,
struct FgetConv *  fc 
)

Convert a file's charset into a string buffer.

Parameters
buf: Buffer for result
buflen: Length of buffer
fc: FgetConv handle
Return values
ptr: Success, result buffer
NULL: Error

Read a file into a buffer, converting the character set as it goes.

Definition at line 970 of file charset.c.

971 {
972  if (!buf)
973  return NULL;
974 
975  size_t r;
976  for (r = 0; (r + 1) < buflen;)
977  {
978  const int c = mutt_ch_fgetconv(fc);
979  if (c == EOF)
980  break;
981  buf[r++] = (char) c;
982  if (c == '\n')
983  break;
984  }
985  buf[r] = '\0';
986 
987  if (r > 0)
988  return buf;
989 
990  return NULL;
991 }
int mutt_ch_fgetconv(struct FgetConv *fc)
Convert a file's character set.
Definition: charset.c:908
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_set_charset()

void mutt_ch_set_charset ( const char *  charset)

Update the records for a new character set.

Parameters
charset: New character set

Check if this character set is utf-8 and pick a suitable replacement character for unprintable characters.

Note
This calls bind_textdomain_codeset() which will affect future message translations.

Definition at line 1003 of file charset.c.

1004 {
1005  char buf[256];
1006 
1007  mutt_ch_canonical_charset(buf, sizeof(buf), charset);
1008 
1009  if (mutt_ch_is_utf8(buf))
1010  {
1011  CharsetIsUtf8 = true;
1012  ReplacementChar = 0xfffd; /* replacement character */
1013  }
1014  else
1015  {
1016  CharsetIsUtf8 = false;
1017  ReplacementChar = '?';
1018  }
1019 
1020 #if defined(HAVE_BIND_TEXTDOMAIN_CODESET) && defined(ENABLE_NLS)
1021  bind_textdomain_codeset(PACKAGE, buf);
1022 #endif
1023 }
#define mutt_ch_is_utf8(str)
Definition: charset.h:95
wchar_t ReplacementChar
When a Unicode character can't be displayed, use this instead.
Definition: charset.c:58
void mutt_ch_canonical_charset(char *buf, size_t buflen, const char *name)
Canonicalise the charset of a string.
Definition: charset.c:352
bool CharsetIsUtf8
Is the user's current character set utf-8?
Definition: charset.c:63
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_choose()

char* mutt_ch_choose ( const char *  fromcode,
const char *  charsets,
const char *  u,
size_t  ulen,
char **  d,
size_t *  dlen 
)

Figure the best charset to encode a string.

Parameters
[in] fromcode: Original charset of the string
[in] charsets: Colon-separated list of potential charsets to use
[in] u: String to encode
[in] ulen: Length of the string to encode
[out] d: If not NULL, point it to the converted string
[out] dlen: If not NULL, point it to the length of the d string
Return values
ptr: Best performing charset
NULL: None could be found

Definition at line 1036 of file charset.c.

1038 {
1039  if (!fromcode)
1040  return NULL;
1041 
1042  char *e = NULL, *tocode = NULL;
1043  size_t elen = 0, bestn = 0;
1044  const char *q = NULL;
1045 
1046  for (const char *p = charsets; p; p = q ? q + 1 : 0)
1047  {
1048  q = strchr(p, ':');
1049 
1050  size_t n = q ? q - p : strlen(p);
1051  if (n == 0)
1052  continue;
1053 
1054  char *t = mutt_mem_malloc(n + 1);
1055  memcpy(t, p, n);
1056  t[n] = '\0';
1057 
1058  char *s = mutt_strn_dup(u, ulen);
1059  const int rc = d ? mutt_ch_convert_string(&s, fromcode, t, 0) :
1060  mutt_ch_check(s, ulen, fromcode, t);
1061  if (rc)
1062  {
1063  FREE(&t);
1064  FREE(&s);
1065  continue;
1066  }
1067  size_t slen = mutt_str_len(s);
1068 
1069  if (!tocode || (n < bestn))
1070  {
1071  bestn = n;
1072  FREE(&tocode);
1073  tocode = t;
1074  if (d)
1075  {
1076  FREE(&e);
1077  e = s;
1078  }
1079  else
1080  FREE(&s);
1081  elen = slen;
1082  }
1083  else
1084  {
1085  FREE(&t);
1086  FREE(&s);
1087  }
1088  }
1089  if (tocode)
1090  {
1091  if (d)
1092  *d = e;
1093  if (dlen)
1094  *dlen = elen;
1095 
1096  char canonical_buf[1024];
1097  mutt_ch_canonical_charset(canonical_buf, sizeof(canonical_buf), tocode);
1098  mutt_str_replace(&tocode, canonical_buf);
1099  }
1100  return tocode;
1101 }
int mutt_ch_convert_string(char **ps, const char *from, const char *to, int flags)
Convert a string between encodings.
Definition: charset.c:754
char * mutt_strn_dup(const char *begin, size_t len)
Duplicate a sub-string.
Definition: string.c:553
void mutt_ch_canonical_charset(char *buf, size_t buflen, const char *name)
Canonicalise the charset of a string.
Definition: charset.c:352
int mutt_ch_check(const char *s, size_t slen, const char *from, const char *to)
Check whether a string can be converted between encodings.
Definition: charset.c:717
void * mutt_mem_malloc(size_t size)
Allocate memory on the heap.
Definition: memory.c:90
char * p
Definition: charset.h:47
size_t mutt_str_len(const char *a)
Calculate the length of a string, safely.
Definition: string.c:636
int n
Definition: acutest.h:492
char * mutt_str_replace(char **p, const char *s)
Replace one string with another.
Definition: string.c:451
#define FREE(x)
Definition: memory.h:40
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

Variable Documentation

◆ C_AssumedCharset

char* C_AssumedCharset

Config: If a message is missing a character set, assume this character set.

Definition at line 52 of file charset.c.

◆ C_Charset

char* C_Charset

Config: Default character set for displaying text on screen.

Definition at line 53 of file charset.c.

◆ ReplacementChar

wchar_t ReplacementChar = '?'

When a Unicode character can't be displayed, use this instead.

Definition at line 58 of file charset.c.

◆ CharsetIsUtf8

bool CharsetIsUtf8 = false

Is the user's current character set utf-8?

Definition at line 63 of file charset.c.

◆ Lookups

struct LookupList Lookups = TAILQ_HEAD_INITIALIZER(Lookups)
static

Definition at line 79 of file charset.c.

◆ PreferredMimeNames

const struct MimeNames PreferredMimeNames[]

Lookup table of preferred charsets.

The following list has been created manually from the data under: http://www.isi.edu/in-notes/iana/assignments/character-sets (last update: 2000-09-07).

Note
It includes only the subset of character sets for which a preferred MIME name is given.

Definition at line 101 of file charset.c.