NeoMutt  2021-02-05-329-g9e03b7
Teaching an old dog new tricks
DOXYGEN
charset.c File Reference

Conversion between different character encodings. More...

#include "config.h"
#include <ctype.h>
#include <errno.h>
#include <iconv.h>
#include <langinfo.h>
#include <limits.h>
#include <stdbool.h>
#include <stdio.h>
#include <string.h>
#include "config/lib.h"
#include "core/lib.h"
#include "charset.h"
#include "buffer.h"
#include "memory.h"
#include "queue.h"
#include "regex3.h"
#include "string2.h"
#include <libintl.h>
+ Include dependency graph for charset.c:

Go to the source code of this file.

Data Structures

struct  Lookup
 Regex to String lookup table. More...
 
struct  MimeNames
 MIME name lookup entry. More...
 

Macros

#define EILSEQ   EINVAL
 

Functions

 TAILQ_HEAD (LookupList, Lookup)
 
static struct Lookup * lookup_new (void)
 Create a new Lookup. More...
 
static void lookup_free (struct Lookup **ptr)
 Free a Lookup. More...
 
static const char * lookup_charset (enum LookupType type, const char *cs)
 Look for a preferred character set name. More...
 
int mutt_ch_convert_nonmime_string (char **ps)
 Try to convert a string using a list of character sets. More...
 
void mutt_ch_canonical_charset (char *buf, size_t buflen, const char *name)
 Canonicalise the charset of a string. More...
 
bool mutt_ch_chscmp (const char *cs1, const char *cs2)
 Are the names of two character sets equivalent? More...
 
char * mutt_ch_get_default_charset (void)
 Get the default character set. More...
 
char * mutt_ch_get_langinfo_charset (void)
 Get the user's choice of character set. More...
 
bool mutt_ch_lookup_add (enum LookupType type, const char *pat, const char *replace, struct Buffer *err)
 Add a new character set lookup. More...
 
void mutt_ch_lookup_remove (void)
 Remove all the character set lookups. More...
 
const char * mutt_ch_charset_lookup (const char *chs)
 Look for a replacement character set. More...
 
iconv_t mutt_ch_iconv_open (const char *tocode, const char *fromcode, uint8_t flags)
 Set up iconv for conversions. More...
 
size_t mutt_ch_iconv (iconv_t cd, const char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft, const char **inrepls, const char *outrepl, int *iconverrno)
 Change the encoding of a string. More...
 
const char * mutt_ch_iconv_lookup (const char *chs)
 Look for a replacement character set. More...
 
int mutt_ch_check (const char *s, size_t slen, const char *from, const char *to)
 Check whether a string can be converted between encodings. More...
 
int mutt_ch_convert_string (char **ps, const char *from, const char *to, uint8_t flags)
 Convert a string between encodings. More...
 
bool mutt_ch_check_charset (const char *cs, bool strict)
 Does iconv understand a character set? More...
 
struct FgetConv * mutt_ch_fgetconv_open (FILE *fp, const char *from, const char *to, uint8_t flags)
 Prepare a file for charset conversion. More...
 
void mutt_ch_fgetconv_close (struct FgetConv **fc)
 Close an fgetconv handle. More...
 
int mutt_ch_fgetconv (struct FgetConv *fc)
 Convert a file's character set. More...
 
char * mutt_ch_fgetconvs (char *buf, size_t buflen, struct FgetConv *fc)
 Convert a file's charset into a string buffer. More...
 
void mutt_ch_set_charset (const char *charset)
 Update the records for a new character set. More...
 
char * mutt_ch_choose (const char *fromcode, const char *charsets, const char *u, size_t ulen, char **d, size_t *dlen)
 Figure the best charset to encode a string. More...
 

Variables

wchar_t ReplacementChar = '?'
 When a Unicode character can't be displayed, use this instead. More...
 
bool CharsetIsUtf8 = false
 Is the user's current character set utf-8? More...
 
static struct LookupList Lookups = TAILQ_HEAD_INITIALIZER(Lookups)
 
const struct MimeNames PreferredMimeNames []
 Lookup table of preferred charsets. More...
 

Detailed Description

Conversion between different character encodings.

Authors
  • Thomas Roessler

This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with this program. If not, see http://www.gnu.org/licenses/.

Definition in file charset.c.

Macro Definition Documentation

◆ EILSEQ

#define EILSEQ   EINVAL

Definition at line 51 of file charset.c.

Function Documentation

◆ TAILQ_HEAD()

TAILQ_HEAD ( LookupList  ,
Lookup   
)

◆ lookup_new()

static struct Lookup* lookup_new ( void  )
static

Create a new Lookup.

Return values
ptrNew Lookup

Definition at line 248 of file charset.c.

249 {
250  return mutt_mem_calloc(1, sizeof(struct Lookup));
251 }
void * mutt_mem_calloc(size_t nmemb, size_t size)
Allocate zeroed memory on the heap.
Definition: memory.c:50
Regex to String lookup table.
Definition: charset.c:69
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ lookup_free()

static void lookup_free ( struct Lookup **  ptr)
static

Free a Lookup.

Parameters
ptrLookup to free

Definition at line 257 of file charset.c.

258 {
259  if (!ptr || !*ptr)
260  return;
261 
262  struct Lookup *l = *ptr;
263  FREE(&l->replacement);
264  FREE(&l->regex.pattern);
265  if (l->regex.regex)
266  regfree(l->regex.regex);
267  FREE(&l->regex.regex);
268  FREE(&l->regex);
269 
270  FREE(ptr);
271 }
regex_t * regex
compiled expression
Definition: regex3.h:92
char * replacement
Alternative charset to use.
Definition: charset.c:73
struct Regex regex
Regular expression.
Definition: charset.c:72
Regex to String lookup table.
Definition: charset.c:69
#define FREE(x)
Definition: memory.h:40
char * pattern
printable version
Definition: regex3.h:91
+ Here is the caller graph for this function:

◆ lookup_charset()

static const char* lookup_charset ( enum LookupType  type,
const char *  cs 
)
static

Look for a preferred character set name.

Parameters
typeType, e.g. MUTT_LOOKUP_CHARSET
csCharacter set
Return values
ptrCharset string

If the character set matches one of the regexes, then return the replacement name.

Definition at line 282 of file charset.c.

283 {
284  if (!cs)
285  return NULL;
286 
287  struct Lookup *l = NULL;
288 
289  TAILQ_FOREACH(l, &Lookups, entries)
290  {
291  if (l->type != type)
292  continue;
293  if (mutt_regex_match(&l->regex, cs))
294  return l->replacement;
295  }
296  return NULL;
297 }
#define TAILQ_FOREACH(var, head, field)
Definition: queue.h:718
char * replacement
Alternative charset to use.
Definition: charset.c:73
struct Regex regex
Regular expression.
Definition: charset.c:72
Regex to String lookup table.
Definition: charset.c:69
enum LookupType type
Lookup type.
Definition: charset.c:71
bool mutt_regex_match(const struct Regex *regex, const char *str)
Shorthand to mutt_regex_capture()
Definition: regex.c:613
static struct LookupList Lookups
Definition: charset.c:78
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_convert_nonmime_string()

int mutt_ch_convert_nonmime_string ( char **  ps)

Try to convert a string using a list of character sets.

Parameters
[in,out]psString to be converted
Return values
0Success
-1Error

Work through $assumed_charset looking for a character set conversion that works. Failing that, try mutt_ch_get_default_charset().

Definition at line 308 of file charset.c.

309 {
310  if (!ps)
311  return -1;
312 
313  char *u = *ps;
314  const size_t ulen = mutt_str_len(u);
315  if (ulen == 0)
316  return 0;
317 
318  const char *c1 = NULL;
319 
320  const char *const c_assumed_charset =
321  cs_subset_string(NeoMutt->sub, "assumed_charset");
322  const char *const c_charset = cs_subset_string(NeoMutt->sub, "charset");
323  for (const char *c = c_assumed_charset; c; c = c1 ? c1 + 1 : 0)
324  {
325  c1 = strchr(c, ':');
326  size_t n = c1 ? c1 - c : mutt_str_len(c);
327  if (n == 0)
328  return 0;
329  char *fromcode = mutt_mem_malloc(n + 1);
330  mutt_str_copy(fromcode, c, n + 1);
331  char *s = mutt_strn_dup(u, ulen);
332  int m = mutt_ch_convert_string(&s, fromcode, c_charset, MUTT_ICONV_NO_FLAGS);
333  FREE(&fromcode);
334  FREE(&s);
335  if (m == 0)
336  {
337  return 0;
338  }
339  }
340  mutt_ch_convert_string(ps, (const char *) mutt_ch_get_default_charset(),
341  c_charset, MUTT_ICONV_HOOK_FROM);
342  return -1;
343 }
int mutt_ch_convert_string(char **ps, const char *from, const char *to, uint8_t flags)
Convert a string between encodings.
Definition: charset.c:758
#define MUTT_ICONV_NO_FLAGS
No flags are set.
Definition: charset.h:71
Container for Accounts, Notifications.
Definition: neomutt.h:36
char * mutt_strn_dup(const char *begin, size_t len)
Duplicate a sub-string.
Definition: string.c:548
char * mutt_ch_get_default_charset(void)
Get the default character set.
Definition: charset.c:442
#define MUTT_ICONV_HOOK_FROM
apply charset-hooks to fromcode
Definition: charset.h:72
void * mutt_mem_malloc(size_t size)
Allocate memory on the heap.
Definition: memory.c:90
const char * cs_subset_string(const struct ConfigSubset *sub, const char *name)
Get a string config item by name.
Definition: helpers.c:295
size_t mutt_str_len(const char *a)
Calculate the length of a string, safely.
Definition: string.c:631
size_t mutt_str_copy(char *dest, const char *src, size_t dsize)
Copy a string into a buffer (guaranteeing NUL-termination)
Definition: string.c:716
#define FREE(x)
Definition: memory.h:40
struct ConfigSubset * sub
Inherited config items.
Definition: neomutt.h:39
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_canonical_charset()

void mutt_ch_canonical_charset ( char *  buf,
size_t  buflen,
const char *  name 
)

Canonicalise the charset of a string.

Parameters
bufBuffer for canonical character set name
buflenLength of buffer
nameName to be canonicalised

This first ties off any charset extension such as "//TRANSLIT", canonicalizes the charset and re-adds the extension

Definition at line 354 of file charset.c.

355 {
356  if (!buf || !name)
357  return;
358 
359  char in[1024], scratch[1024];
360 
361  mutt_str_copy(in, name, sizeof(in));
362  char *ext = strchr(in, '/');
363  if (ext)
364  *ext++ = '\0';
365 
366  if (mutt_istr_equal(in, "utf-8") || mutt_istr_equal(in, "utf8"))
367  {
368  mutt_str_copy(buf, "utf-8", buflen);
369  goto out;
370  }
371 
372  /* catch some common iso-8859-something misspellings */
373  size_t plen;
374  if ((plen = mutt_istr_startswith(in, "8859")) && (in[plen] != '-'))
375  snprintf(scratch, sizeof(scratch), "iso-8859-%s", in + plen);
376  else if ((plen = mutt_istr_startswith(in, "8859-")))
377  snprintf(scratch, sizeof(scratch), "iso-8859-%s", in + plen);
378  else if ((plen = mutt_istr_startswith(in, "iso8859")) && (in[plen] != '-'))
379  snprintf(scratch, sizeof(scratch), "iso_8859-%s", in + plen);
380  else if ((plen = mutt_istr_startswith(in, "iso8859-")))
381  snprintf(scratch, sizeof(scratch), "iso_8859-%s", in + plen);
382  else
383  mutt_str_copy(scratch, in, sizeof(scratch));
384 
385  for (size_t i = 0; PreferredMimeNames[i].key; i++)
386  {
387  if (mutt_istr_equal(scratch, PreferredMimeNames[i].key))
388  {
389  mutt_str_copy(buf, PreferredMimeNames[i].pref, buflen);
390  goto out;
391  }
392  }
393 
394  mutt_str_copy(buf, scratch, buflen);
395 
396  /* for cosmetics' sake, transform to lowercase. */
397  for (char *p = buf; *p; p++)
398  *p = tolower(*p);
399 
400 out:
401  if (ext && *ext)
402  {
403  mutt_str_cat(buf, buflen, "/");
404  mutt_str_cat(buf, buflen, ext);
405  }
406 }
const char * key
Definition: charset.c:85
static size_t plen
Length of cached packet.
Definition: pgppacket.c:39
const struct MimeNames PreferredMimeNames[]
Lookup table of preferred charsets.
Definition: charset.c:100
bool mutt_istr_equal(const char *a, const char *b)
Compare two strings, ignoring case.
Definition: string.c:883
size_t mutt_istr_startswith(const char *str, const char *prefix)
Check whether a string starts with a prefix, ignoring case.
Definition: string.c:172
char * mutt_str_cat(char *buf, size_t buflen, const char *s)
Concatenate two strings.
Definition: string.c:385
size_t mutt_str_copy(char *dest, const char *src, size_t dsize)
Copy a string into a buffer (guaranteeing NUL-termination)
Definition: string.c:716
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_chscmp()

bool mutt_ch_chscmp ( const char *  cs1,
const char *  cs2 
)

Are the names of two character sets equivalent?

Parameters
cs1First character set
cs2Second character set
Return values
trueNames are equivalent
falseNames differ

Charsets may have extensions that mutt_ch_canonical_charset() leaves intact. We expect 'cs2' to originate from neomutt code, not user input (i.e. 'cs2' does not have any extension), so we simply check whether the shorter string is a prefix of the longer one.

Definition at line 420 of file charset.c.

421 {
422  if (!cs1 || !cs2)
423  return false;
424 
425  char buf[256];
426 
427  mutt_ch_canonical_charset(buf, sizeof(buf), cs1);
428 
429  int len1 = mutt_str_len(buf);
430  int len2 = mutt_str_len(cs2);
431 
432  return mutt_istrn_equal(((len1 > len2) ? buf : cs2),
433  ((len1 > len2) ? cs2 : buf), MIN(len1, len2));
434 }
#define MIN(a, b)
Definition: memory.h:31
bool mutt_istrn_equal(const char *a, const char *b, size_t num)
Check for equality of two strings ignoring case (to a maximum), safely.
Definition: string.c:621
size_t mutt_str_len(const char *a)
Calculate the length of a string, safely.
Definition: string.c:631
void mutt_ch_canonical_charset(char *buf, size_t buflen, const char *name)
Canonicalise the charset of a string.
Definition: charset.c:354
+ Here is the call graph for this function:

◆ mutt_ch_get_default_charset()

char* mutt_ch_get_default_charset ( void  )

Get the default character set.

Return values
ptrName of the default character set
Warning
This returns a pointer to a static buffer. Do not free it.

Definition at line 442 of file charset.c.

443 {
444  static char fcharset[128];
445  const char *const c_assumed_charset =
446  cs_subset_string(NeoMutt->sub, "assumed_charset");
447  const char *c = c_assumed_charset;
448  const char *c1 = NULL;
449 
450  if (c)
451  {
452  c1 = strchr(c, ':');
453  mutt_str_copy(fcharset, c, c1 ? (c1 - c + 1) : sizeof(fcharset));
454  return fcharset;
455  }
456  return strcpy(fcharset, "us-ascii");
457 }
Container for Accounts, Notifications.
Definition: neomutt.h:36
const char * cs_subset_string(const struct ConfigSubset *sub, const char *name)
Get a string config item by name.
Definition: helpers.c:295
size_t mutt_str_copy(char *dest, const char *src, size_t dsize)
Copy a string into a buffer (guaranteeing NUL-termination)
Definition: string.c:716
struct ConfigSubset * sub
Inherited config items.
Definition: neomutt.h:39
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_get_langinfo_charset()

char* mutt_ch_get_langinfo_charset ( void  )

Get the user's choice of character set.

Return values
ptrCharset string

Get the canonical character set used by the user's locale. The caller must free the returned string.

Definition at line 466 of file charset.c.

467 {
468  char buf[1024] = { 0 };
469 
470  mutt_ch_canonical_charset(buf, sizeof(buf), nl_langinfo(CODESET));
471 
472  if (buf[0] != '\0')
473  return mutt_str_dup(buf);
474 
475  return mutt_str_dup("iso-8859-1");
476 }
char * mutt_str_dup(const char *str)
Copy a string, safely.
Definition: string.c:370
void mutt_ch_canonical_charset(char *buf, size_t buflen, const char *name)
Canonicalise the charset of a string.
Definition: charset.c:354
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_lookup_add()

bool mutt_ch_lookup_add ( enum LookupType  type,
const char *  pat,
const char *  replace,
struct Buffer *  err 
)

Add a new character set lookup.

Parameters
typeType of character set, e.g. MUTT_LOOKUP_CHARSET
patPattern to match
replaceReplacement string
errBuffer for error message
Return values
trueLookup added to list
falseRegex string was invalid

Add a regex for a character set and a replacement name.

Definition at line 489 of file charset.c.

491 {
492  if (!pat || !replace)
493  return false;
494 
495  regex_t *rx = mutt_mem_malloc(sizeof(regex_t));
496  int rc = REG_COMP(rx, pat, REG_ICASE);
497  if (rc != 0)
498  {
499  regerror(rc, rx, err->data, err->dsize);
500  FREE(&rx);
501  return false;
502  }
503 
504  struct Lookup *l = lookup_new();
505  l->type = type;
506  l->replacement = mutt_str_dup(replace);
507  l->regex.pattern = mutt_str_dup(pat);
508  l->regex.regex = rx;
509  l->regex.pat_not = false;
510 
511  TAILQ_INSERT_TAIL(&Lookups, l, entries);
512 
513  return true;
514 }
regex_t * regex
compiled expression
Definition: regex3.h:92
bool pat_not
do not match
Definition: regex3.h:93
char * mutt_str_dup(const char *str)
Copy a string, safely.
Definition: string.c:370
char * replacement
Alternative charset to use.
Definition: charset.c:73
#define REG_COMP(preg, regex, cflags)
Compile a regular expression.
Definition: regex3.h:54
struct Regex regex
Regular expression.
Definition: charset.c:72
Regex to String lookup table.
Definition: charset.c:69
size_t dsize
Length of data.
Definition: buffer.h:37
static struct Lookup * lookup_new(void)
Create a new Lookup.
Definition: charset.c:248
void * mutt_mem_malloc(size_t size)
Allocate memory on the heap.
Definition: memory.c:90
char * data
Pointer to data.
Definition: buffer.h:35
#define TAILQ_INSERT_TAIL(head, elm, field)
Definition: queue.h:802
enum LookupType type
Lookup type.
Definition: charset.c:71
#define FREE(x)
Definition: memory.h:40
static struct LookupList Lookups
Definition: charset.c:78
char * pattern
printable version
Definition: regex3.h:91
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_lookup_remove()

void mutt_ch_lookup_remove ( void  )

Remove all the character set lookups.

Empty the list of replacement character set names.

Definition at line 521 of file charset.c.

522 {
523  struct Lookup *l = NULL;
524  struct Lookup *tmp = NULL;
525 
526  TAILQ_FOREACH_SAFE(l, &Lookups, entries, tmp)
527  {
528  TAILQ_REMOVE(&Lookups, l, entries);
529  lookup_free(&l);
530  }
531 }
#define TAILQ_FOREACH_SAFE(var, head, field, tvar)
Definition: queue.h:728
Regex to String lookup table.
Definition: charset.c:69
#define TAILQ_REMOVE(head, elm, field)
Definition: queue.h:834
static struct LookupList Lookups
Definition: charset.c:78
static void lookup_free(struct Lookup **ptr)
Free a Lookup.
Definition: charset.c:257
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_charset_lookup()

const char* mutt_ch_charset_lookup ( const char *  chs)

Look for a replacement character set.

Parameters
chsCharacter set to lookup
Return values
ptrReplacement character set (if a 'charset-hook' matches)
NULLNo matching hook

Look through all the 'charset-hook's. If one matches return the replacement character set.

Definition at line 542 of file charset.c.

543 {
544  return lookup_charset(MUTT_LOOKUP_CHARSET, chs);
545 }
static char * chs
Definition: gnupgparse.c:73
Alias for another character set.
Definition: charset.h:67
static const char * lookup_charset(enum LookupType type, const char *cs)
Look for a preferred character set name.
Definition: charset.c:282
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_iconv_open()

iconv_t mutt_ch_iconv_open ( const char *  tocode,
const char *  fromcode,
uint8_t  flags 
)

Set up iconv for conversions.

Parameters
tocodeTarget character set
fromcodeCurrent character set
flagsFlags, e.g. MUTT_ICONV_HOOK_FROM
Return values
ptriconv handle for the conversion

Like iconv_open, but canonicalises the charsets, applies charset-hooks, recanonicalises, and finally applies iconv-hooks. Parameter flags=0 skips charset-hooks, while MUTT_ICONV_HOOK_FROM applies them to fromcode. Callers should use flags=0 when fromcode can safely be considered true, either some constant, or some value provided by the user; MUTT_ICONV_HOOK_FROM should be used only when fromcode is unsure, taken from a possibly wrong incoming MIME label, or such. Misusing MUTT_ICONV_HOOK_FROM leads to unwanted interactions in some setups.

Note
By design charset-hooks should never be, and are never, applied to tocode.
The top-well-named MUTT_ICONV_HOOK_FROM acts on charset-hooks, not at all on iconv-hooks.

Definition at line 569 of file charset.c.

570 {
571  char tocode1[128];
572  char fromcode1[128];
573  const char *tocode2 = NULL, *fromcode2 = NULL;
574  const char *tmp = NULL;
575 
576  iconv_t cd;
577 
578  /* transform to MIME preferred charset names */
579  mutt_ch_canonical_charset(tocode1, sizeof(tocode1), tocode);
580  mutt_ch_canonical_charset(fromcode1, sizeof(fromcode1), fromcode);
581 
582  /* maybe apply charset-hooks and recanonicalise fromcode,
583  * but only when caller asked us to sanitize a potentially wrong
584  * charset name incoming from the wild exterior. */
585  if (flags & MUTT_ICONV_HOOK_FROM)
586  {
587  tmp = mutt_ch_charset_lookup(fromcode1);
588  if (tmp)
589  mutt_ch_canonical_charset(fromcode1, sizeof(fromcode1), tmp);
590  }
591 
592  /* always apply iconv-hooks to suit system's iconv tastes */
593  tocode2 = mutt_ch_iconv_lookup(tocode1);
594  tocode2 = tocode2 ? tocode2 : tocode1;
595  fromcode2 = mutt_ch_iconv_lookup(fromcode1);
596  fromcode2 = fromcode2 ? fromcode2 : fromcode1;
597 
598  /* call system iconv with names it appreciates */
599  cd = iconv_open(tocode2, fromcode2);
600  if (cd != (iconv_t) -1)
601  return cd;
602 
603  return (iconv_t) -1;
604 }
const char * mutt_ch_iconv_lookup(const char *chs)
Look for a replacement character set.
Definition: charset.c:706
#define MUTT_ICONV_HOOK_FROM
apply charset-hooks to fromcode
Definition: charset.h:72
const char * mutt_ch_charset_lookup(const char *chs)
Look for a replacement character set.
Definition: charset.c:542
void mutt_ch_canonical_charset(char *buf, size_t buflen, const char *name)
Canonicalise the charset of a string.
Definition: charset.c:354
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_iconv()

size_t mutt_ch_iconv ( iconv_t  cd,
const char **  inbuf,
size_t *  inbytesleft,
char **  outbuf,
size_t *  outbytesleft,
const char **  inrepls,
const char *  outrepl,
int *  iconverrno 
)

Change the encoding of a string.

Parameters
[in]cdIconv conversion descriptor
[in,out]inbufBuffer to convert
[in,out]inbytesleftLength of buffer to convert
[in,out]outbufBuffer for the result
[in,out]outbytesleftLength of result buffer
[in]inreplsInput replacement characters
[in]outreplOutput replacement characters
[out]iconverrnoErrno if iconv() fails, 0 if it succeeds
Return values
numCharacters converted

Like iconv, but keeps going even when the input is invalid. If you're supplying inrepls, the source charset should be stateless; if you're supplying an outrepl, the target charset should be.

Definition at line 622 of file charset.c.

625 {
626  size_t rc = 0;
627  const char *ib = *inbuf;
628  size_t ibl = *inbytesleft;
629  char *ob = *outbuf;
630  size_t obl = *outbytesleft;
631 
632  while (true)
633  {
634  errno = 0;
635  const size_t ret1 = iconv(cd, (ICONV_CONST char **) &ib, &ibl, &ob, &obl);
636  if (ret1 != (size_t) -1)
637  rc += ret1;
638  if (iconverrno)
639  *iconverrno = errno;
640 
641  if (ibl && obl && (errno == EILSEQ))
642  {
643  if (inrepls)
644  {
645  /* Try replacing the input */
646  const char **t = NULL;
647  for (t = inrepls; *t; t++)
648  {
649  const char *ib1 = *t;
650  size_t ibl1 = strlen(*t);
651  char *ob1 = ob;
652  size_t obl1 = obl;
653  iconv(cd, (ICONV_CONST char **) &ib1, &ibl1, &ob1, &obl1);
654  if (ibl1 == 0)
655  {
656  ib++;
657  ibl--;
658  ob = ob1;
659  obl = obl1;
660  rc++;
661  break;
662  }
663  }
664  if (*t)
665  continue;
666  }
667  /* Replace the output */
668  if (!outrepl)
669  outrepl = "?";
670  iconv(cd, NULL, NULL, &ob, &obl);
671  if (obl)
672  {
673  int n = strlen(outrepl);
674  if (n > obl)
675  {
676  outrepl = "?";
677  n = 1;
678  }
679  memcpy(ob, outrepl, n);
680  ib++;
681  ibl--;
682  ob += n;
683  obl -= n;
684  rc++;
685  iconv(cd, NULL, NULL, NULL, NULL); /* for good measure */
686  continue;
687  }
688  }
689  *inbuf = ib;
690  *inbytesleft = ibl;
691  *outbuf = ob;
692  *outbytesleft = obl;
693  return rc;
694  }
695 }
#define EILSEQ
Definition: charset.c:51
+ Here is the caller graph for this function:

◆ mutt_ch_iconv_lookup()

const char* mutt_ch_iconv_lookup ( const char *  chs)

Look for a replacement character set.

Parameters
chsCharacter set to lookup
Return values
ptrReplacement character set (if a 'iconv-hook' matches)
NULLNo matching hook

Look through all the 'iconv-hook's. If one matches return the replacement character set.

Definition at line 706 of file charset.c.

707 {
708  return lookup_charset(MUTT_LOOKUP_ICONV, chs);
709 }
Character set conversion.
Definition: charset.h:68
static char * chs
Definition: gnupgparse.c:73
static const char * lookup_charset(enum LookupType type, const char *cs)
Look for a preferred character set name.
Definition: charset.c:282
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_check()

int mutt_ch_check ( const char *  s,
size_t  slen,
const char *  from,
const char *  to 
)

Check whether a string can be converted between encodings.

Parameters
[in]sString to check
[in]slenLength of the string to check
[in]fromCurrent character set
[in]toTarget character set
Return values
0Success
-1Error in iconv_open()
>0Errno as set by iconv()

Definition at line 721 of file charset.c.

722 {
723  if (!s || !from || !to)
724  return -1;
725 
726  int rc = 0;
727  iconv_t cd = mutt_ch_iconv_open(to, from, MUTT_ICONV_NO_FLAGS);
728  if (cd == (iconv_t) -1)
729  return -1;
730 
731  size_t outlen = MB_LEN_MAX * slen;
732  char *out = mutt_mem_malloc(outlen + 1);
733  char *saved_out = out;
734 
735  const size_t convlen =
736  iconv(cd, (ICONV_CONST char **) &s, &slen, &out, (size_t *) &outlen);
737  if (convlen == -1)
738  rc = errno;
739 
740  FREE(&saved_out);
741  iconv_close(cd);
742  return rc;
743 }
#define MUTT_ICONV_NO_FLAGS
No flags are set.
Definition: charset.h:71
iconv_t mutt_ch_iconv_open(const char *tocode, const char *fromcode, uint8_t flags)
Set up iconv for conversions.
Definition: charset.c:569
void * mutt_mem_malloc(size_t size)
Allocate memory on the heap.
Definition: memory.c:90
#define FREE(x)
Definition: memory.h:40
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_convert_string()

int mutt_ch_convert_string ( char **  ps,
const char *  from,
const char *  to,
uint8_t  flags 
)

Convert a string between encodings.

Parameters
[in,out]psString to convert
[in]fromCurrent character set
[in]toTarget character set
[in]flagsFlags, e.g. MUTT_ICONV_HOOK_FROM
Return values
0Success
-1Invalid arguments or failure to open an iconv channel
errnoFailure in iconv conversion

Parameter flags is given as-is to mutt_ch_iconv_open(). See there for its meaning and usage policy.

Definition at line 758 of file charset.c.

759 {
760  if (!ps)
761  return -1;
762 
763  char *s = *ps;
764 
765  if (!s || (*s == '\0'))
766  return 0;
767 
768  if (!to || !from)
769  return -1;
770 
771  const char *repls[] = { "\357\277\275", "?", 0 };
772  int rc = 0;
773 
774  iconv_t cd = mutt_ch_iconv_open(to, from, flags);
775  if (cd == (iconv_t) -1)
776  return -1;
777 
778  size_t len;
779  const char *ib = NULL;
780  char *buf = NULL, *ob = NULL;
781  size_t ibl, obl;
782  const char **inrepls = NULL;
783  const char *outrepl = NULL;
784 
785  if (mutt_ch_is_utf8(to))
786  outrepl = "\357\277\275";
787  else if (mutt_ch_is_utf8(from))
788  inrepls = repls;
789  else
790  outrepl = "?";
791 
792  len = strlen(s);
793  ib = s;
794  ibl = len + 1;
795  obl = MB_LEN_MAX * ibl;
796  buf = mutt_mem_malloc(obl + 1);
797  ob = buf;
798 
799  mutt_ch_iconv(cd, &ib, &ibl, &ob, &obl, inrepls, outrepl, &rc);
800  iconv_close(cd);
801 
802  *ob = '\0';
803 
804  FREE(ps);
805  *ps = buf;
806 
807  mutt_str_adjust(ps);
808  return rc;
809 }
iconv_t mutt_ch_iconv_open(const char *tocode, const char *fromcode, uint8_t flags)
Set up iconv for conversions.
Definition: charset.c:569
void mutt_str_adjust(char **ptr)
Shrink-to-fit a string.
Definition: string.c:490
void * mutt_mem_malloc(size_t size)
Allocate memory on the heap.
Definition: memory.c:90
size_t mutt_ch_iconv(iconv_t cd, const char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft, const char **inrepls, const char *outrepl, int *iconverrno)
Change the encoding of a string.
Definition: charset.c:622
#define mutt_ch_is_utf8(str)
Definition: charset.h:95
#define FREE(x)
Definition: memory.h:40
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_check_charset()

bool mutt_ch_check_charset ( const char *  cs,
bool  strict 
)

Does iconv understand a character set?

Parameters
csCharacter set to check
strictCheck strictly by using iconv
Return values
trueCharacter set is valid

If strict is false, then finding a matching character set in PreferredMimeNames will be enough. If strict is true, or the charset is not in PreferredMimeNames, then iconv() will be run.

Definition at line 822 of file charset.c.

823 {
824  if (!cs)
825  return false;
826 
827  if (mutt_ch_is_utf8(cs))
828  return true;
829 
830  if (!strict)
831  {
832  for (int i = 0; PreferredMimeNames[i].key; i++)
833  {
834  if (mutt_istr_equal(PreferredMimeNames[i].key, cs) ||
835  mutt_istr_equal(PreferredMimeNames[i].pref, cs))
836  {
837  return true;
838  }
839  }
840  }
841 
842  iconv_t cd = mutt_ch_iconv_open(cs, cs, MUTT_ICONV_NO_FLAGS);
843  if (cd != (iconv_t) (-1))
844  {
845  iconv_close(cd);
846  return true;
847  }
848 
849  return false;
850 }
const char * key
Definition: charset.c:85
#define MUTT_ICONV_NO_FLAGS
No flags are set.
Definition: charset.h:71
iconv_t mutt_ch_iconv_open(const char *tocode, const char *fromcode, uint8_t flags)
Set up iconv for conversions.
Definition: charset.c:569
const struct MimeNames PreferredMimeNames[]
Lookup table of preferred charsets.
Definition: charset.c:100
bool mutt_istr_equal(const char *a, const char *b)
Compare two strings, ignoring case.
Definition: string.c:883
#define mutt_ch_is_utf8(str)
Definition: charset.h:95
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_fgetconv_open()

struct FgetConv* mutt_ch_fgetconv_open ( FILE *  fp,
const char *  from,
const char *  to,
uint8_t  flags 
)

Prepare a file for charset conversion.

Parameters
fpFILE ptr to prepare
fromCurrent character set
toDestination character set
flagsFlags, e.g. MUTT_ICONV_HOOK_FROM
Return values
ptrfgetconv handle

Parameter flags is given as-is to mutt_ch_iconv_open().

Definition at line 862 of file charset.c.

863 {
864  struct FgetConv *fc = NULL;
865  iconv_t cd = (iconv_t) -1;
866 
867  if (from && to)
868  cd = mutt_ch_iconv_open(to, from, flags);
869 
870  if (cd != (iconv_t) -1)
871  {
872  static const char *repls[] = { "\357\277\275", "?", 0 };
873 
874  fc = mutt_mem_malloc(sizeof(struct FgetConv));
875  fc->p = fc->bufo;
876  fc->ob = fc->bufo;
877  fc->ib = fc->bufi;
878  fc->ibl = 0;
879  fc->inrepls = mutt_ch_is_utf8(to) ? repls : repls + 1;
880  }
881  else
882  fc = mutt_mem_malloc(sizeof(struct FgetConvNot));
883  fc->fp = fp;
884  fc->cd = cd;
885  return fc;
886 }
char bufi[512]
Definition: charset.h:44
iconv_t mutt_ch_iconv_open(const char *tocode, const char *fromcode, uint8_t flags)
Set up iconv for conversions.
Definition: charset.c:569
size_t ibl
Definition: charset.h:49
A dummy converter.
Definition: charset.h:56
FILE * fp
Definition: charset.h:42
iconv_t cd
Definition: charset.h:43
void * mutt_mem_malloc(size_t size)
Allocate memory on the heap.
Definition: memory.c:90
char * ib
Definition: charset.h:48
char * p
Definition: charset.h:46
char bufo[512]
Definition: charset.h:45
#define mutt_ch_is_utf8(str)
Definition: charset.h:95
Cursor for converting a file's encoding.
Definition: charset.h:40
char * ob
Definition: charset.h:47
const char ** inrepls
Definition: charset.h:50
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_fgetconv_close()

void mutt_ch_fgetconv_close ( struct FgetConv **  fc)

Close an fgetconv handle.

Parameters
[out] fc: fgetconv handle

Definition at line 892 of file charset.c.

893 {
894  if (!fc || !*fc)
895  return;
896 
897  if ((*fc)->cd != (iconv_t) -1)
898  iconv_close((*fc)->cd);
899  FREE(fc);
900 }
#define FREE(x)
Definition: memory.h:40
+ Here is the caller graph for this function:

◆ mutt_ch_fgetconv()

int mutt_ch_fgetconv ( struct FgetConv *  fc)

Convert a file's character set.

Parameters
fc: FgetConv handle
Return values
num: Next character in the converted file
EOF: Error, or end of file

A file is read into a buffer and its character set is converted. Each call to this function will return one converted character. The buffer is refilled automatically when empty.

Definition at line 912 of file charset.c.

913 {
914  if (!fc)
915  return EOF;
916  if (fc->cd == (iconv_t) -1)
917  return fgetc(fc->fp);
918  if (!fc->p)
919  return EOF;
920  if (fc->p < fc->ob)
921  return (unsigned char) *(fc->p)++;
922 
923  /* Try to convert some more */
924  fc->p = fc->bufo;
925  fc->ob = fc->bufo;
926  if (fc->ibl)
927  {
928  size_t obl = sizeof(fc->bufo);
929  iconv(fc->cd, (ICONV_CONST char **) &fc->ib, &fc->ibl, &fc->ob, &obl);
930  if (fc->p < fc->ob)
931  return (unsigned char) *(fc->p)++;
932  }
933 
934  /* If we trusted iconv a bit more, we would at this point
935  * ask why it had stopped converting ... */
936 
937  /* Try to read some more */
938  if ((fc->ibl == sizeof(fc->bufi)) ||
939  (fc->ibl && (fc->ib + fc->ibl < fc->bufi + sizeof(fc->bufi))))
940  {
941  fc->p = 0;
942  return EOF;
943  }
944  if (fc->ibl)
945  memcpy(fc->bufi, fc->ib, fc->ibl);
946  fc->ib = fc->bufi;
947  fc->ibl += fread(fc->ib + fc->ibl, 1, sizeof(fc->bufi) - fc->ibl, fc->fp);
948 
949  /* Try harder this time to convert some */
950  if (fc->ibl)
951  {
952  size_t obl = sizeof(fc->bufo);
953  mutt_ch_iconv(fc->cd, (const char **) &fc->ib, &fc->ibl, &fc->ob, &obl,
954  fc->inrepls, 0, NULL);
955  if (fc->p < fc->ob)
956  return (unsigned char) *(fc->p)++;
957  }
958 
959  /* Either the file has finished or one of the buffers is too small */
960  fc->p = 0;
961  return EOF;
962 }
char bufi[512]
Definition: charset.h:44
size_t ibl
Definition: charset.h:49
FILE * fp
Definition: charset.h:42
iconv_t cd
Definition: charset.h:43
char * ib
Definition: charset.h:48
char * p
Definition: charset.h:46
char bufo[512]
Definition: charset.h:45
size_t mutt_ch_iconv(iconv_t cd, const char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft, const char **inrepls, const char *outrepl, int *iconverrno)
Change the encoding of a string.
Definition: charset.c:622
char * ob
Definition: charset.h:47
const char ** inrepls
Definition: charset.h:50
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_fgetconvs()

char* mutt_ch_fgetconvs ( char *  buf,
size_t  buflen,
struct FgetConv *  fc
)

Convert a file's charset into a string buffer.

Parameters
buf: Buffer for result
buflen: Length of buffer
fc: FgetConv handle
Return values
ptr: Success, result buffer
NULL: Error

Read a file into a buffer, converting the character set as it goes.

Definition at line 974 of file charset.c.

975 {
976  if (!buf)
977  return NULL;
978 
979  size_t r;
980  for (r = 0; (r + 1) < buflen;)
981  {
982  const int c = mutt_ch_fgetconv(fc);
983  if (c == EOF)
984  break;
985  buf[r++] = (char) c;
986  if (c == '\n')
987  break;
988  }
989  buf[r] = '\0';
990 
991  if (r > 0)
992  return buf;
993 
994  return NULL;
995 }
int mutt_ch_fgetconv(struct FgetConv *fc)
Convert a file's character set.
Definition: charset.c:912
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_set_charset()

void mutt_ch_set_charset ( const char *  charset)

Update the records for a new character set.

Parameters
charset: New character set

Check if this character set is utf-8 and pick a suitable replacement character for unprintable characters.

Note
This calls bind_textdomain_codeset() which will affect future message translations.

Definition at line 1007 of file charset.c.

1008 {
1009  char buf[256];
1010 
1011  mutt_ch_canonical_charset(buf, sizeof(buf), charset);
1012 
1013  if (mutt_ch_is_utf8(buf))
1014  {
1015  CharsetIsUtf8 = true;
1016  ReplacementChar = 0xfffd; /* replacement character */
1017  }
1018  else
1019  {
1020  CharsetIsUtf8 = false;
1021  ReplacementChar = '?';
1022  }
1023 
1024 #if defined(HAVE_BIND_TEXTDOMAIN_CODESET) && defined(ENABLE_NLS)
1025  bind_textdomain_codeset(PACKAGE, buf);
1026 #endif
1027 }
wchar_t ReplacementChar
When a Unicode character can't be displayed, use this instead.
Definition: charset.c:57
bool CharsetIsUtf8
Is the user's current character set utf-8?
Definition: charset.c:62
#define mutt_ch_is_utf8(str)
Definition: charset.h:95
void mutt_ch_canonical_charset(char *buf, size_t buflen, const char *name)
Canonicalise the charset of a string.
Definition: charset.c:354
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_choose()

char* mutt_ch_choose ( const char *  fromcode,
const char *  charsets,
const char *  u,
size_t  ulen,
char **  d,
size_t *  dlen 
)

Figure the best charset to encode a string.

Parameters
[in] fromcode: Original charset of the string
[in] charsets: Colon-separated list of potential charsets to use
[in] u: String to encode
[in] ulen: Length of the string to encode
[out] d: If not NULL, point it to the converted string
[out] dlen: If not NULL, point it to the length of the d string
Return values
ptr: Best performing charset
NULL: None could be found

Definition at line 1040 of file charset.c.

1042 {
1043  if (!fromcode)
1044  return NULL;
1045 
1046  char *e = NULL, *tocode = NULL;
1047  size_t elen = 0, bestn = 0;
1048  const char *q = NULL;
1049 
1050  for (const char *p = charsets; p; p = q ? q + 1 : 0)
1051  {
1052  q = strchr(p, ':');
1053 
1054  size_t n = q ? q - p : strlen(p);
1055  if (n == 0)
1056  continue;
1057 
1058  char *t = mutt_mem_malloc(n + 1);
1059  memcpy(t, p, n);
1060  t[n] = '\0';
1061 
1062  char *s = mutt_strn_dup(u, ulen);
1063  const int rc = d ? mutt_ch_convert_string(&s, fromcode, t, MUTT_ICONV_NO_FLAGS) :
1064  mutt_ch_check(s, ulen, fromcode, t);
1065  if (rc)
1066  {
1067  FREE(&t);
1068  FREE(&s);
1069  continue;
1070  }
1071  size_t slen = mutt_str_len(s);
1072 
1073  if (!tocode || (n < bestn))
1074  {
1075  bestn = n;
1076  FREE(&tocode);
1077  tocode = t;
1078  if (d)
1079  {
1080  FREE(&e);
1081  e = s;
1082  }
1083  else
1084  FREE(&s);
1085  elen = slen;
1086  }
1087  else
1088  {
1089  FREE(&t);
1090  FREE(&s);
1091  }
1092  }
1093  if (tocode)
1094  {
1095  if (d)
1096  *d = e;
1097  if (dlen)
1098  *dlen = elen;
1099 
1100  char canonical_buf[1024];
1101  mutt_ch_canonical_charset(canonical_buf, sizeof(canonical_buf), tocode);
1102  mutt_str_replace(&tocode, canonical_buf);
1103  }
1104  return tocode;
1105 }
int mutt_ch_convert_string(char **ps, const char *from, const char *to, uint8_t flags)
Convert a string between encodings.
Definition: charset.c:758
#define MUTT_ICONV_NO_FLAGS
No flags are set.
Definition: charset.h:71
char * mutt_strn_dup(const char *begin, size_t len)
Duplicate a sub-string.
Definition: string.c:548
int mutt_ch_check(const char *s, size_t slen, const char *from, const char *to)
Check whether a string can be converted between encodings.
Definition: charset.c:721
void * mutt_mem_malloc(size_t size)
Allocate memory on the heap.
Definition: memory.c:90
char * p
Definition: charset.h:46
size_t mutt_str_len(const char *a)
Calculate the length of a string, safely.
Definition: string.c:631
char * mutt_str_replace(char **p, const char *s)
Replace one string with another.
Definition: string.c:446
#define FREE(x)
Definition: memory.h:40
void mutt_ch_canonical_charset(char *buf, size_t buflen, const char *name)
Canonicalise the charset of a string.
Definition: charset.c:354
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

Variable Documentation

◆ ReplacementChar

wchar_t ReplacementChar = '?'

When a Unicode character can't be displayed, use this instead.

Definition at line 57 of file charset.c.

◆ CharsetIsUtf8

bool CharsetIsUtf8 = false

Is the user's current character set utf-8?

Definition at line 62 of file charset.c.

◆ Lookups

struct LookupList Lookups = TAILQ_HEAD_INITIALIZER(Lookups)
static

Definition at line 78 of file charset.c.

◆ PreferredMimeNames

const struct MimeNames PreferredMimeNames[]

Lookup table of preferred charsets.

The following list has been created manually from the data under http://www.isi.edu/in-notes/iana/assignments/character-sets (last update: 2000-09-07).

Note
It includes only the subset of character sets for which a preferred MIME name is given.

Definition at line 100 of file charset.c.