NeoMutt  2024-02-01-23-g345d7b
Teaching an old dog new tricks
DOXYGEN
Loading...
Searching...
No Matches
charset.c File Reference

Conversion between different character encodings. More...

#include "config.h"
#include <ctype.h>
#include <errno.h>
#include <iconv.h>
#include <langinfo.h>
#include <limits.h>
#include <stdbool.h>
#include <stdio.h>
#include <string.h>
#include "charset.h"
#include "buffer.h"
#include "list.h"
#include "logging2.h"
#include "memory.h"
#include "queue.h"
#include "regex3.h"
#include "slist.h"
#include "string2.h"
#include <libintl.h>
+ Include dependency graph for charset.c:

Go to the source code of this file.

Data Structures

struct  Lookup
 Regex to String lookup table. More...
 
struct  IconvCacheEntry
 Cached iconv conversion descriptor. More...
 
struct  MimeNames
 MIME name lookup entry. More...
 

Macros

#define EILSEQ   EINVAL
 
#define ICONV_CACHE_SIZE   16
 Max size of the iconv cache.
 

Functions

 TAILQ_HEAD (LookupList, Lookup)
 
static struct Lookup * lookup_new (void)
 Create a new Lookup.
 
static void lookup_free (struct Lookup **ptr)
 Free a Lookup.
 
static const char * lookup_charset (enum LookupType type, const char *cs)
 Look for a preferred character set name.
 
int mutt_ch_convert_nonmime_string (const struct Slist *const assumed_charset, const char *charset, char **ps)
 Try to convert a string using a list of character sets.
 
void mutt_ch_canonical_charset (char *buf, size_t buflen, const char *name)
 Canonicalise the charset of a string.
 
bool mutt_ch_chscmp (const char *cs1, const char *cs2)
 Are the names of two character sets equivalent?
 
const char * mutt_ch_get_default_charset (const struct Slist *const assumed_charset)
 Get the default character set.
 
char * mutt_ch_get_langinfo_charset (void)
 Get the user's choice of character set.
 
bool mutt_ch_lookup_add (enum LookupType type, const char *pat, const char *replace, struct Buffer *err)
 Add a new character set lookup.
 
void mutt_ch_lookup_remove (void)
 Remove all the character set lookups.
 
const char * mutt_ch_charset_lookup (const char *chs)
 Look for a replacement character set.
 
iconv_t mutt_ch_iconv_open (const char *tocode, const char *fromcode, uint8_t flags)
 Set up iconv for conversions.
 
size_t mutt_ch_iconv (iconv_t cd, const char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft, const char **inrepls, const char *outrepl, int *iconverrno)
 Change the encoding of a string.
 
const char * mutt_ch_iconv_lookup (const char *chs)
 Look for a replacement character set.
 
int mutt_ch_check (const char *s, size_t slen, const char *from, const char *to)
 Check whether a string can be converted between encodings.
 
int mutt_ch_convert_string (char **ps, const char *from, const char *to, uint8_t flags)
 Convert a string between encodings.
 
bool mutt_ch_check_charset (const char *cs, bool strict)
 Does iconv understand a character set?
 
struct FgetConv * mutt_ch_fgetconv_open (FILE *fp, const char *from, const char *to, uint8_t flags)
 Prepare a file for charset conversion.
 
void mutt_ch_fgetconv_close (struct FgetConv **ptr)
 Close an fgetconv handle.
 
int mutt_ch_fgetconv (struct FgetConv *fc)
 Convert a file's character set.
 
char * mutt_ch_fgetconvs (char *buf, size_t buflen, struct FgetConv *fc)
 Convert a file's charset into a string buffer.
 
void mutt_ch_set_charset (const char *charset)
 Update the records for a new character set.
 
char * mutt_ch_choose (const char *fromcode, const struct Slist *charsets, const char *u, size_t ulen, char **d, size_t *dlen)
 Figure the best charset to encode a string.
 
void mutt_ch_cache_cleanup (void)
 Clean up the cached iconv handles and charset strings.
 

Variables

wchar_t ReplacementChar = '?'
 When a Unicode character can't be displayed, use this instead.
 
bool CharsetIsUtf8 = false
 Is the user's current character set utf-8?
 
static struct LookupList Lookups = TAILQ_HEAD_INITIALIZER(Lookups)
 Lookup table of preferred character set names.
 
static struct IconvCacheEntry IconvCache [ICONV_CACHE_SIZE]
 Cache of iconv conversion descriptors.
 
static int IconvCacheUsed = 0
 Number of iconv descriptors in the cache.
 
static const struct MimeNames PreferredMimeNames []
 Lookup table of preferred charsets.
 

Detailed Description

Conversion between different character encodings.

Authors
  • Tobias Angele
  • Richard Russon
  • Pietro Cerutti
  • Steinar H Gunderson

This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with this program. If not, see http://www.gnu.org/licenses/.

Definition in file charset.c.

Macro Definition Documentation

◆ EILSEQ

#define EILSEQ   EINVAL

Definition at line 55 of file charset.c.

◆ ICONV_CACHE_SIZE

#define ICONV_CACHE_SIZE   16

Max size of the iconv cache.

Definition at line 96 of file charset.c.

Function Documentation

◆ TAILQ_HEAD()

TAILQ_HEAD ( LookupList  ,
Lookup   
)

◆ lookup_new()

static struct Lookup * lookup_new ( void  )
static

Create a new Lookup.

Return values
ptr: New Lookup

Definition at line 269 of file charset.c.

270{
271 return mutt_mem_calloc(1, sizeof(struct Lookup));
272}
void * mutt_mem_calloc(size_t nmemb, size_t size)
Allocate zeroed memory on the heap.
Definition: memory.c:50
Regex to String lookup table.
Definition: charset.c:74
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ lookup_free()

static void lookup_free ( struct Lookup **  ptr)
static

Free a Lookup.

Parameters
ptr: Lookup to free

Definition at line 278 of file charset.c.

279{
280 if (!ptr || !*ptr)
281 return;
282
283 struct Lookup *l = *ptr;
284 FREE(&l->replacement);
285 FREE(&l->regex.pattern);
286 if (l->regex.regex)
287 regfree(l->regex.regex);
288 FREE(&l->regex.regex);
289 FREE(&l->regex);
290
291 FREE(ptr);
292}
#define FREE(x)
Definition: memory.h:45
char * replacement
Alternative charset to use.
Definition: charset.c:77
struct Regex regex
Regular expression.
Definition: charset.c:76
char * pattern
printable version
Definition: regex3.h:86
regex_t * regex
compiled expression
Definition: regex3.h:87
+ Here is the caller graph for this function:

◆ lookup_charset()

static const char * lookup_charset ( enum LookupType  type,
const char *  cs 
)
static

Look for a preferred character set name.

Parameters
type: Type, e.g. MUTT_LOOKUP_CHARSET
cs: Character set
Return values
ptr: Charset string

If the character set matches one of the regexes, then return the replacement name.

Definition at line 303 of file charset.c.

304{
305 if (!cs)
306 return NULL;
307
308 struct Lookup *l = NULL;
309
310 TAILQ_FOREACH(l, &Lookups, entries)
311 {
312 if (l->type != type)
313 continue;
314 if (mutt_regex_match(&l->regex, cs))
315 return l->replacement;
316 }
317 return NULL;
318}
static struct LookupList Lookups
Lookup table of preferred character set names.
Definition: charset.c:83
bool mutt_regex_match(const struct Regex *regex, const char *str)
Shorthand to mutt_regex_capture()
Definition: regex.c:639
#define TAILQ_FOREACH(var, head, field)
Definition: queue.h:725
enum LookupType type
Lookup type.
Definition: charset.c:75
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_convert_nonmime_string()

int mutt_ch_convert_nonmime_string ( const struct Slist *const  assumed_charset,
const char *  charset,
char **  ps 
)

Try to convert a string using a list of character sets.

Parameters
[in] assumed_charset: From $assumed_charset
[in] charset: From $charset
[in,out] ps: String to be converted
Return values
0: Success
-1: Error

Work through $assumed_charset looking for a character set conversion that works. Failing that, try mutt_ch_get_default_charset().

Definition at line 331 of file charset.c.

333{
334 if (!ps)
335 return -1;
336
337 char *u = *ps;
338 const size_t ulen = mutt_str_len(u);
339 if (ulen == 0)
340 return 0;
341
342 const struct ListNode *np = NULL;
343 STAILQ_FOREACH(np, &assumed_charset->head, entries)
344 {
345 char const *c = np->data;
346 size_t n = mutt_str_len(c);
347 char *fromcode = mutt_mem_malloc(n + 1);
348 mutt_str_copy(fromcode, c, n + 1);
349 char *s = mutt_strn_dup(u, ulen);
350 int m = mutt_ch_convert_string(&s, fromcode, charset, MUTT_ICONV_NO_FLAGS);
351 FREE(&fromcode);
352 if (m == 0)
353 {
354 FREE(ps);
355 *ps = s;
356 return 0;
357 }
358 FREE(&s);
359 }
361 charset, MUTT_ICONV_HOOK_FROM);
362 return -1;
363}
void * mutt_mem_malloc(size_t size)
Allocate memory on the heap.
Definition: memory.c:90
int mutt_ch_convert_string(char **ps, const char *from, const char *to, uint8_t flags)
Convert a string between encodings.
Definition: charset.c:830
const char * mutt_ch_get_default_charset(const struct Slist *const assumed_charset)
Get the default character set.
Definition: charset.c:464
#define MUTT_ICONV_HOOK_FROM
apply charset-hooks to fromcode
Definition: charset.h:73
#define MUTT_ICONV_NO_FLAGS
No flags are set.
Definition: charset.h:72
char * mutt_strn_dup(const char *begin, size_t len)
Duplicate a sub-string.
Definition: string.c:429
size_t mutt_str_len(const char *a)
Calculate the length of a string, safely.
Definition: string.c:545
size_t mutt_str_copy(char *dest, const char *src, size_t dsize)
Copy a string into a buffer (guaranteeing NUL-termination)
Definition: string.c:630
#define STAILQ_FOREACH(var, head, field)
Definition: queue.h:352
A List node for strings.
Definition: list.h:35
char * data
String.
Definition: list.h:36
struct ListHead head
List containing values.
Definition: slist.h:38
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_canonical_charset()

void mutt_ch_canonical_charset ( char *  buf,
size_t  buflen,
const char *  name 
)

Canonicalise the charset of a string.

Parameters
buf: Buffer for canonical character set name
buflen: Length of buffer
name: Name to be canonicalised

This first ties off any charset extension such as "//TRANSLIT", canonicalizes the charset, and then re-adds the extension.

Definition at line 374 of file charset.c.

375{
376 if (!buf || !name)
377 return;
378
379 char in[1024] = { 0 };
380 char scratch[1024 + 10] = { 0 };
381
382 mutt_str_copy(in, name, sizeof(in));
383 char *ext = strchr(in, '/');
384 if (ext)
385 *ext++ = '\0';
386
387 if (mutt_istr_equal(in, "utf-8") || mutt_istr_equal(in, "utf8"))
388 {
389 mutt_str_copy(buf, "utf-8", buflen);
390 goto out;
391 }
392
393 /* catch some common iso-8859-something misspellings */
394 size_t plen;
395 if ((plen = mutt_istr_startswith(in, "8859")) && (in[plen] != '-'))
396 snprintf(scratch, sizeof(scratch), "iso-8859-%s", in + plen);
397 else if ((plen = mutt_istr_startswith(in, "8859-")))
398 snprintf(scratch, sizeof(scratch), "iso-8859-%s", in + plen);
399 else if ((plen = mutt_istr_startswith(in, "iso8859")) && (in[plen] != '-'))
400 snprintf(scratch, sizeof(scratch), "iso_8859-%s", in + plen);
401 else if ((plen = mutt_istr_startswith(in, "iso8859-")))
402 snprintf(scratch, sizeof(scratch), "iso_8859-%s", in + plen);
403 else
404 mutt_str_copy(scratch, in, sizeof(scratch));
405
406 for (size_t i = 0; PreferredMimeNames[i].key; i++)
407 {
408 if (mutt_istr_equal(scratch, PreferredMimeNames[i].key))
409 {
410 mutt_str_copy(buf, PreferredMimeNames[i].pref, buflen);
411 goto out;
412 }
413 }
414
415 mutt_str_copy(buf, scratch, buflen);
416
417 /* for cosmetics' sake, transform to lowercase. */
418 for (char *p = buf; *p; p++)
419 *p = tolower(*p);
420
421out:
422 if (ext && *ext)
423 {
424 mutt_str_cat(buf, buflen, "/");
425 mutt_str_cat(buf, buflen, ext);
426 }
427}
static const struct MimeNames PreferredMimeNames[]
Lookup table of preferred charsets.
Definition: charset.c:121
bool mutt_istr_equal(const char *a, const char *b)
Compare two strings, ignoring case.
Definition: string.c:721
size_t mutt_istr_startswith(const char *str, const char *prefix)
Check whether a string starts with a prefix, ignoring case.
Definition: string.c:242
char * mutt_str_cat(char *buf, size_t buflen, const char *s)
Concatenate two strings.
Definition: string.c:268
const char * key
Definition: charset.c:107
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_chscmp()

bool mutt_ch_chscmp ( const char *  cs1,
const char *  cs2 
)

Are the names of two character sets equivalent?

Parameters
cs1: First character set
cs2: Second character set
Return values
true: Names are equivalent
false: Names differ

Charsets may have extensions that mutt_ch_canonical_charset() leaves intact. We expect 'cs2' to originate from neomutt code, not user input (i.e. 'cs2' does not have any extension), so we simply check whether the shorter string is a prefix of the longer.

Definition at line 441 of file charset.c.

442{
443 if (!cs1 || !cs2)
444 return false;
445
446 char buf[256] = { 0 };
447
448 mutt_ch_canonical_charset(buf, sizeof(buf), cs1);
449
450 int len1 = mutt_str_len(buf);
451 int len2 = mutt_str_len(cs2);
452
453 return mutt_istrn_equal(((len1 > len2) ? buf : cs2),
454 ((len1 > len2) ? cs2 : buf), MIN(len1, len2));
455}
#define MIN(a, b)
Definition: memory.h:32
void mutt_ch_canonical_charset(char *buf, size_t buflen, const char *name)
Canonicalise the charset of a string.
Definition: charset.c:374
bool mutt_istrn_equal(const char *a, const char *b, size_t num)
Check for equality of two strings ignoring case (to a maximum), safely.
Definition: string.c:502
+ Here is the call graph for this function:

◆ mutt_ch_get_default_charset()

const char * mutt_ch_get_default_charset ( const struct Slist *const  assumed_charset)

Get the default character set.

Parameters
assumed_charset: From $assumed_charset
Return values
ptr: Name of the default character set
Warning
This returns a pointer to a static buffer. Do not free it.

Definition at line 464 of file charset.c.

465{
466 static char fcharset[128];
467 const char *c = NULL;
468
469 if (assumed_charset && (assumed_charset->count > 0))
470 c = STAILQ_FIRST(&assumed_charset->head)->data;
471 else
472 c = "us-ascii";
473
474 mutt_str_copy(fcharset, c, sizeof(fcharset));
475 return fcharset;
476}
#define STAILQ_FIRST(head)
Definition: queue.h:350
size_t count
Number of values in list.
Definition: slist.h:39
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_get_langinfo_charset()

char * mutt_ch_get_langinfo_charset ( void  )

Get the user's choice of character set.

Return values
ptr: Charset string

Get the canonical character set used by the user's locale. The caller must free the returned string.

Definition at line 485 of file charset.c.

486{
487 char buf[1024] = { 0 };
488
489 mutt_ch_canonical_charset(buf, sizeof(buf), nl_langinfo(CODESET));
490
491 if (buf[0] != '\0')
492 return mutt_str_dup(buf);
493
494 return mutt_str_dup("iso-8859-1");
495}
char * mutt_str_dup(const char *str)
Copy a string, safely.
Definition: string.c:253
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_lookup_add()

bool mutt_ch_lookup_add ( enum LookupType  type,
const char *  pat,
const char *  replace,
struct Buffer *  err 
)

Add a new character set lookup.

Parameters
type: Type of character set, e.g. MUTT_LOOKUP_CHARSET
pat: Pattern to match
replace: Replacement string
err: Buffer for error message
Return values
true: Lookup added to list
false: Regex string was invalid

Add a regex for a character set and a replacement name.

Definition at line 508 of file charset.c.

510{
511 if (!pat || !replace)
512 return false;
513
514 regex_t *rx = mutt_mem_calloc(1, sizeof(regex_t));
515 int rc = REG_COMP(rx, pat, REG_ICASE);
516 if (rc != 0)
517 {
518 regerror(rc, rx, err->data, err->dsize);
519 FREE(&rx);
520 return false;
521 }
522
523 struct Lookup *l = lookup_new();
524 l->type = type;
525 l->replacement = mutt_str_dup(replace);
526 l->regex.pattern = mutt_str_dup(pat);
527 l->regex.regex = rx;
528 l->regex.pat_not = false;
529
530 TAILQ_INSERT_TAIL(&Lookups, l, entries);
531
532 return true;
533}
static struct Lookup * lookup_new(void)
Create a new Lookup.
Definition: charset.c:269
#define TAILQ_INSERT_TAIL(head, elm, field)
Definition: queue.h:809
#define REG_COMP(preg, regex, cflags)
Compile a regular expression.
Definition: regex3.h:49
size_t dsize
Length of data.
Definition: buffer.h:39
char * data
Pointer to data.
Definition: buffer.h:37
bool pat_not
do not match
Definition: regex3.h:88
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_lookup_remove()

void mutt_ch_lookup_remove ( void  )

Remove all the character set lookups.

Empty the list of replacement character set names.

Definition at line 540 of file charset.c.

541{
542 struct Lookup *l = NULL;
543 struct Lookup *tmp = NULL;
544
545 TAILQ_FOREACH_SAFE(l, &Lookups, entries, tmp)
546 {
547 TAILQ_REMOVE(&Lookups, l, entries);
548 lookup_free(&l);
549 }
550}
static void lookup_free(struct Lookup **ptr)
Free a Lookup.
Definition: charset.c:278
#define TAILQ_FOREACH_SAFE(var, head, field, tvar)
Definition: queue.h:735
#define TAILQ_REMOVE(head, elm, field)
Definition: queue.h:841
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_charset_lookup()

const char * mutt_ch_charset_lookup ( const char *  chs)

Look for a replacement character set.

Parameters
chs: Character set to lookup
Return values
ptr: Replacement character set (if a 'charset-hook' matches)
NULL: No matching hook

Look through all the 'charset-hook's. If one matches return the replacement character set.

Definition at line 561 of file charset.c.

562{
564}
static const char * lookup_charset(enum LookupType type, const char *cs)
Look for a preferred character set name.
Definition: charset.c:303
@ MUTT_LOOKUP_CHARSET
Alias for another character set.
Definition: charset.h:68
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_iconv_open()

iconv_t mutt_ch_iconv_open ( const char *  tocode,
const char *  fromcode,
uint8_t  flags 
)

Set up iconv for conversions.

Parameters
tocode: Target character set
fromcode: Current character set
flags: Flags, e.g. MUTT_ICONV_HOOK_FROM
Return values
ptr: iconv handle for the conversion

Like iconv_open, but canonicalises the charsets, applies charset-hooks, recanonicalises, and finally applies iconv-hooks. Parameter flags=0 skips charset-hooks, while MUTT_ICONV_HOOK_FROM applies them to fromcode. Callers should use flags=0 when fromcode can safely be considered true, either some constant, or some value provided by the user; MUTT_ICONV_HOOK_FROM should be used only when fromcode is unsure, taken from a possibly wrong incoming MIME label, or such. Misusing MUTT_ICONV_HOOK_FROM leads to unwanted interactions in some setups.

Since calling iconv_open() repeatedly can be expensive, we keep a cache of the most recently used iconv_t objects, kept in LRU order. This means that you should not call iconv_close() on the object yourself. All remaining objects in the cache will be cleaned up when main() calls mutt_ch_cache_cleanup().

Note
By design charset-hooks should never be, and are never, applied to tocode.
The top-well-named MUTT_ICONV_HOOK_FROM acts on charset-hooks, not at all on iconv-hooks.

Definition at line 593 of file charset.c.

594{
595 char tocode1[128] = { 0 };
596 char fromcode1[128] = { 0 };
597 const char *tocode2 = NULL, *fromcode2 = NULL;
598 const char *tmp = NULL;
599
600 /* transform to MIME preferred charset names */
601 mutt_ch_canonical_charset(tocode1, sizeof(tocode1), tocode);
602 mutt_ch_canonical_charset(fromcode1, sizeof(fromcode1), fromcode);
603
604 /* maybe apply charset-hooks and recanonicalise fromcode,
605 * but only when caller asked us to sanitize a potentially wrong
606 * charset name incoming from the wild exterior. */
607 if (flags & MUTT_ICONV_HOOK_FROM)
608 {
609 tmp = mutt_ch_charset_lookup(fromcode1);
610 if (tmp)
611 mutt_ch_canonical_charset(fromcode1, sizeof(fromcode1), tmp);
612 }
613
614 /* check if we have this pair cached already */
615 for (int i = 0; i < IconvCacheUsed; ++i)
616 {
617 if (strcmp(tocode1, IconvCache[i].tocode1) == 0 &&
618 strcmp(fromcode1, IconvCache[i].fromcode1) == 0)
619 {
620 iconv_t cd = IconvCache[i].cd;
621
622 /* make room for this one at the top */
623 struct IconvCacheEntry top = IconvCache[i];
624 for (int j = i; j-- > 0;)
625 {
626 IconvCache[j + 1] = IconvCache[j];
627 }
628 IconvCache[0] = top;
629
630 if (iconv_t_valid(cd))
631 {
632 /* reset state */
633 iconv(cd, NULL, NULL, NULL, NULL);
634 }
635 return cd;
636 }
637 }
638
639 /* not found in cache */
640 /* always apply iconv-hooks to suit system's iconv tastes */
641 tocode2 = mutt_ch_iconv_lookup(tocode1);
642 tocode2 = tocode2 ? tocode2 : tocode1;
643 fromcode2 = mutt_ch_iconv_lookup(fromcode1);
644 fromcode2 = fromcode2 ? fromcode2 : fromcode1;
645
646 /* call system iconv with names it appreciates */
647 iconv_t cd = iconv_open(tocode2, fromcode2);
648
650 {
651 mutt_debug(LL_DEBUG2, "iconv: dropping %s -> %s from the cache\n",
654 /* get rid of the oldest entry */
658 {
659 iconv_close(IconvCache[IconvCacheUsed - 1].cd);
660 }
662 }
663
664 /* make room for this one at the top */
665 for (int j = IconvCacheUsed; j-- > 0;)
666 {
667 IconvCache[j + 1] = IconvCache[j];
668 }
669
671
672 mutt_debug(LL_DEBUG2, "iconv: adding %s -> %s to the cache\n", fromcode1, tocode1);
673 IconvCache[0].fromcode1 = strdup(fromcode1);
674 IconvCache[0].tocode1 = strdup(tocode1);
675 IconvCache[0].cd = cd;
676
677 return cd;
678}
#define mutt_debug(LEVEL,...)
Definition: logging2.h:89
@ LL_DEBUG2
Log at debug level 2.
Definition: logging2.h:44
static int IconvCacheUsed
Number of iconv descriptors in the cache.
Definition: charset.c:100
const char * mutt_ch_iconv_lookup(const char *chs)
Look for a replacement character set.
Definition: charset.c:780
const char * mutt_ch_charset_lookup(const char *chs)
Look for a replacement character set.
Definition: charset.c:561
#define ICONV_CACHE_SIZE
Max size of the iconv cache.
Definition: charset.c:96
static struct IconvCacheEntry IconvCache[ICONV_CACHE_SIZE]
Cache of iconv conversion descriptors.
Definition: charset.c:98
static bool iconv_t_valid(const iconv_t cd)
Is the conversion descriptor valid?
Definition: charset.h:113
Cached iconv conversion descriptor.
Definition: charset.c:89
char * tocode1
Destination character set.
Definition: charset.c:91
char * fromcode1
Source character set.
Definition: charset.c:90
iconv_t cd
iconv conversion descriptor
Definition: charset.c:92
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_iconv()

size_t mutt_ch_iconv ( iconv_t  cd,
const char **  inbuf,
size_t *  inbytesleft,
char **  outbuf,
size_t *  outbytesleft,
const char **  inrepls,
const char *  outrepl,
int *  iconverrno 
)

Change the encoding of a string.

Parameters
[in] cd: Iconv conversion descriptor
[in,out] inbuf: Buffer to convert
[in,out] inbytesleft: Length of buffer to convert
[in,out] outbuf: Buffer for the result
[in,out] outbytesleft: Length of result buffer
[in] inrepls: Input replacement characters
[in] outrepl: Output replacement characters
[out] iconverrno: Errno if iconv() fails, 0 if it succeeds
Return values
num: Characters converted

Like iconv, but keeps going even when the input is invalid. If you're supplying inrepls, the source charset should be stateless; if you're supplying an outrepl, the target charset should be stateless too.

Definition at line 696 of file charset.c.

699{
700 size_t rc = 0;
701 const char *ib = *inbuf;
702 size_t ibl = *inbytesleft;
703 char *ob = *outbuf;
704 size_t obl = *outbytesleft;
705
706 while (true)
707 {
708 errno = 0;
709 const size_t ret1 = iconv(cd, (ICONV_CONST char **) &ib, &ibl, &ob, &obl);
710 if (ret1 != ICONV_ILLEGAL_SEQ)
711 rc += ret1;
712 if (iconverrno)
713 *iconverrno = errno;
714
715 if (ibl && obl && (errno == EILSEQ))
716 {
717 if (inrepls)
718 {
719 /* Try replacing the input */
720 const char **t = NULL;
721 for (t = inrepls; *t; t++)
722 {
723 const char *ib1 = *t;
724 size_t ibl1 = strlen(*t);
725 char *ob1 = ob;
726 size_t obl1 = obl;
727 iconv(cd, (ICONV_CONST char **) &ib1, &ibl1, &ob1, &obl1);
728 if (ibl1 == 0)
729 {
730 ib++;
731 ibl--;
732 ob = ob1;
733 obl = obl1;
734 rc++;
735 break;
736 }
737 }
738 if (*t)
739 continue;
740 }
741 /* Replace the output */
742 if (!outrepl)
743 outrepl = "?";
744 iconv(cd, NULL, NULL, &ob, &obl);
745 if (obl)
746 {
747 int n = strlen(outrepl);
748 if (n > obl)
749 {
750 outrepl = "?";
751 n = 1;
752 }
753 memcpy(ob, outrepl, n);
754 ib++;
755 ibl--;
756 ob += n;
757 obl -= n;
758 rc++;
759 iconv(cd, NULL, NULL, NULL, NULL); /* for good measure */
760 continue;
761 }
762 }
763 *inbuf = ib;
764 *inbytesleft = ibl;
765 *outbuf = ob;
766 *outbytesleft = obl;
767 return rc;
768 }
769}
#define EILSEQ
Definition: charset.c:55
#define ICONV_ILLEGAL_SEQ
Error value for iconv() - Illegal sequence.
Definition: charset.h:104
+ Here is the caller graph for this function:

◆ mutt_ch_iconv_lookup()

const char * mutt_ch_iconv_lookup ( const char *  chs)

Look for a replacement character set.

Parameters
chs: Character set to lookup
Return values
ptr: Replacement character set (if a 'iconv-hook' matches)
NULL: No matching hook

Look through all the 'iconv-hook's. If one matches return the replacement character set.

Definition at line 780 of file charset.c.

781{
783}
@ MUTT_LOOKUP_ICONV
Character set conversion.
Definition: charset.h:69
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_check()

int mutt_ch_check ( const char *  s,
size_t  slen,
const char *  from,
const char *  to 
)

Check whether a string can be converted between encodings.

Parameters
[in] s: String to check
[in] slen: Length of the string to check
[in] from: Current character set
[in] to: Target character set
Return values
0: Success
-1: Error in iconv_open()
>0: Errno as set by iconv()

Definition at line 795 of file charset.c.

796{
797 if (!s || !from || !to)
798 return -1;
799
800 int rc = 0;
801 iconv_t cd = mutt_ch_iconv_open(to, from, MUTT_ICONV_NO_FLAGS);
802 if (!iconv_t_valid(cd))
803 return -1;
804
805 size_t outlen = MB_LEN_MAX * slen;
806 char *out = mutt_mem_malloc(outlen + 1);
807 char *saved_out = out;
808
809 const size_t convlen = iconv(cd, (ICONV_CONST char **) &s, &slen, &out, &outlen);
810 if (convlen == ICONV_ILLEGAL_SEQ)
811 rc = errno;
812
813 FREE(&saved_out);
814 return rc;
815}
iconv_t mutt_ch_iconv_open(const char *tocode, const char *fromcode, uint8_t flags)
Set up iconv for conversions.
Definition: charset.c:593
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_convert_string()

int mutt_ch_convert_string ( char **  ps,
const char *  from,
const char *  to,
uint8_t  flags 
)

Convert a string between encodings.

Parameters
[in,out] ps: String to convert
[in] from: Current character set
[in] to: Target character set
[in] flags: Flags, e.g. MUTT_ICONV_HOOK_FROM
Return values
0: Success
-1: Invalid arguments or failure to open an iconv channel
errno: Failure in iconv conversion

Parameter flags is given as-is to mutt_ch_iconv_open(). See there for its meaning and usage policy.

Definition at line 830 of file charset.c.

831{
832 if (!ps)
833 return -1;
834
835 char *s = *ps;
836
837 if (!s || (*s == '\0'))
838 return 0;
839
840 if (!to || !from)
841 return -1;
842
843 const char *repls[] = { "\357\277\275", "?", 0 };
844 int rc = 0;
845
846 iconv_t cd = mutt_ch_iconv_open(to, from, flags);
847 if (!iconv_t_valid(cd))
848 return -1;
849
850 const char **inrepls = NULL;
851 const char *outrepl = NULL;
852
853 if (mutt_ch_is_utf8(to))
854 outrepl = "\357\277\275";
855 else if (mutt_ch_is_utf8(from))
856 inrepls = repls;
857 else
858 outrepl = "?";
859
860 const char *ib = s;
861 size_t ibl = strlen(s);
862 if (ibl >= (SIZE_MAX / MB_LEN_MAX))
863 {
864 return -1;
865 }
866 size_t obl = MB_LEN_MAX * ibl;
867 char *buf = mutt_mem_malloc(obl + 1);
868 char *ob = buf;
869
870 mutt_ch_iconv(cd, &ib, &ibl, &ob, &obl, inrepls, outrepl, &rc);
871 iconv(cd, 0, 0, &ob, &obl);
872
873 *ob = '\0';
874
875 FREE(ps);
876 *ps = buf;
877
878 mutt_str_adjust(ps);
879 return rc;
880}
size_t mutt_ch_iconv(iconv_t cd, const char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft, const char **inrepls, const char *outrepl, int *iconverrno)
Change the encoding of a string.
Definition: charset.c:696
#define mutt_ch_is_utf8(str)
Definition: charset.h:97
void mutt_str_adjust(char **ptr)
Shrink-to-fit a string.
Definition: string.c:348
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_check_charset()

bool mutt_ch_check_charset ( const char *  cs,
bool  strict 
)

Does iconv understand a character set?

Parameters
cs: Character set to check
strict: Check strictly by using iconv
Return values
true: Character set is valid

If strict is false, then finding a matching character set in PreferredMimeNames will be enough. If strict is true, or the charset is not in PreferredMimeNames, then iconv() will be run.

Definition at line 893 of file charset.c.

894{
895 if (!cs)
896 return false;
897
898 if (mutt_ch_is_utf8(cs))
899 return true;
900
901 if (!strict)
902 {
903 for (int i = 0; PreferredMimeNames[i].key; i++)
904 {
905 if (mutt_istr_equal(PreferredMimeNames[i].key, cs) ||
907 {
908 return true;
909 }
910 }
911 }
912
913 iconv_t cd = mutt_ch_iconv_open(cs, cs, MUTT_ICONV_NO_FLAGS);
914 if (iconv_t_valid(cd))
915 {
916 return true;
917 }
918
919 return false;
920}
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_fgetconv_open()

struct FgetConv * mutt_ch_fgetconv_open ( FILE *  fp,
const char *  from,
const char *  to,
uint8_t  flags 
)

Prepare a file for charset conversion.

Parameters
fp: FILE ptr to prepare
from: Current character set
to: Destination character set
flags: Flags, e.g. MUTT_ICONV_HOOK_FROM
Return values
ptr: fgetconv handle

Parameter flags is given as-is to mutt_ch_iconv_open().

Definition at line 932 of file charset.c.

933{
934 struct FgetConv *fc = NULL;
935 iconv_t cd = ICONV_T_INVALID;
936
937 if (from && to)
938 cd = mutt_ch_iconv_open(to, from, flags);
939
940 if (iconv_t_valid(cd))
941 {
942 static const char *repls[] = { "\357\277\275", "?", 0 };
943
944 fc = mutt_mem_malloc(sizeof(struct FgetConv));
945 fc->p = fc->bufo;
946 fc->ob = fc->bufo;
947 fc->ib = fc->bufi;
948 fc->ibl = 0;
949 fc->inrepls = mutt_ch_is_utf8(to) ? repls : repls + 1;
950 }
951 else
952 {
953 fc = mutt_mem_malloc(sizeof(struct FgetConvNot));
954 }
955 fc->fp = fp;
956 fc->cd = cd;
957 return fc;
958}
#define ICONV_T_INVALID
Error value for iconv functions.
Definition: charset.h:101
A dummy converter.
Definition: charset.h:58
Cursor for converting a file's encoding.
Definition: charset.h:42
char bufi[512]
Definition: charset.h:45
iconv_t cd
iconv conversion descriptor
Definition: charset.h:44
char bufo[512]
Definition: charset.h:46
size_t ibl
Definition: charset.h:50
FILE * fp
Definition: charset.h:43
char * p
Definition: charset.h:47
const char ** inrepls
Definition: charset.h:51
char * ib
Definition: charset.h:49
char * ob
Definition: charset.h:48
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_fgetconv_close()

void mutt_ch_fgetconv_close ( struct FgetConv **  ptr)

Close an fgetconv handle.

Parameters
[out] ptr: fgetconv handle

Definition at line 964 of file charset.c.

965{
966 if (!ptr || !*ptr)
967 return;
968
969 FREE(ptr);
970}
+ Here is the caller graph for this function:

◆ mutt_ch_fgetconv()

int mutt_ch_fgetconv ( struct FgetConv * fc)

Convert a file's character set.

Parameters
fc: FgetConv handle
Return values
num: Next character in the converted file
EOF: Error

A file is read into a buffer and its character set is converted. Each call to this function will return one converted character. The buffer is refilled automatically when empty.

Definition at line 982 of file charset.c.

983{
984 if (!fc)
985 return EOF;
986 if (!iconv_t_valid(fc->cd))
987 return fgetc(fc->fp);
988 if (!fc->p)
989 return EOF;
990 if (fc->p < fc->ob)
991 return (unsigned char) *(fc->p)++;
992
993 /* Try to convert some more */
994 fc->p = fc->bufo;
995 fc->ob = fc->bufo;
996 if (fc->ibl)
997 {
998 size_t obl = sizeof(fc->bufo);
999 iconv(fc->cd, (ICONV_CONST char **) &fc->ib, &fc->ibl, &fc->ob, &obl);
1000 if (fc->p < fc->ob)
1001 return (unsigned char) *(fc->p)++;
1002 }
1003
1004 /* If we trusted iconv a bit more, we would at this point
1005 * ask why it had stopped converting ... */
1006
1007 /* Try to read some more */
1008 if ((fc->ibl == sizeof(fc->bufi)) ||
1009 (fc->ibl && (fc->ib + fc->ibl < fc->bufi + sizeof(fc->bufi))))
1010 {
1011 fc->p = 0;
1012 return EOF;
1013 }
1014 if (fc->ibl)
1015 memcpy(fc->bufi, fc->ib, fc->ibl);
1016 fc->ib = fc->bufi;
1017 fc->ibl += fread(fc->ib + fc->ibl, 1, sizeof(fc->bufi) - fc->ibl, fc->fp);
1018
1019 /* Try harder this time to convert some */
1020 if (fc->ibl)
1021 {
1022 size_t obl = sizeof(fc->bufo);
1023 mutt_ch_iconv(fc->cd, (const char **) &fc->ib, &fc->ibl, &fc->ob, &obl,
1024 fc->inrepls, 0, NULL);
1025 if (fc->p < fc->ob)
1026 return (unsigned char) *(fc->p)++;
1027 }
1028
1029 /* Either the file has finished or one of the buffers is too small */
1030 fc->p = 0;
1031 return EOF;
1032}
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_fgetconvs()

char * mutt_ch_fgetconvs ( char *  buf,
size_t  buflen,
struct FgetConv * fc 
)

Convert a file's charset into a string buffer.

Parameters
buf: Buffer for result
buflen: Length of buffer
fc: FgetConv handle
Return values
ptr: Success, result buffer
NULL: Error

Read a file into a buffer, converting the character set as it goes.

Definition at line 1044 of file charset.c.

/**
 * mutt_ch_fgetconvs - Convert a file's charset into a string buffer
 * @param buf    Buffer for result
 * @param buflen Length of buffer
 * @param fc     FgetConv handle
 * @retval ptr  Success, result buffer
 * @retval NULL Error, or nothing could be read
 *
 * Read a file into a buffer, converting the character set as it goes.
 *
 * At most buflen - 1 characters are stored. Reading stops after a newline
 * (which is kept in the buffer) or when mutt_ch_fgetconv() returns EOF.
 * The result is always NUL-terminated.
 */
char *mutt_ch_fgetconvs(char *buf, size_t buflen, struct FgetConv *fc)
{
  /* A zero-length buffer has no room even for the terminating NUL */
  if (!buf || (buflen == 0))
    return NULL;

  size_t r;
  for (r = 0; (r + 1) < buflen;)
  {
    const int c = mutt_ch_fgetconv(fc);
    if (c == EOF)
      break;
    buf[r++] = (char) c;
    if (c == '\n')
      break;
  }
  buf[r] = '\0';

  if (r > 0)
    return buf;

  return NULL;
}
int mutt_ch_fgetconv(struct FgetConv *fc)
Convert a file's character set.
Definition: charset.c:982
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_set_charset()

void mutt_ch_set_charset ( const char *  charset)

Update the records for a new character set.

Parameters
charset: New character set

Check if this character set is utf-8 and pick a suitable replacement character for unprintable characters.

Note
This calls bind_textdomain_codeset() which will affect future message translations.

Definition at line 1077 of file charset.c.

1078{
1079 char buf[256] = { 0 };
1080
1081 mutt_ch_canonical_charset(buf, sizeof(buf), charset);
1082
1083 if (mutt_ch_is_utf8(buf))
1084 {
1085 CharsetIsUtf8 = true;
1086 ReplacementChar = 0xfffd; /* replacement character */
1087 }
1088 else
1089 {
1090 CharsetIsUtf8 = false;
1091 ReplacementChar = '?';
1092 }
1093
1094#if defined(HAVE_BIND_TEXTDOMAIN_CODESET) && defined(ENABLE_NLS)
1095 bind_textdomain_codeset(PACKAGE, buf);
1096#endif
1097}
bool CharsetIsUtf8
Is the user's current character set utf-8?
Definition: charset.c:66
wchar_t ReplacementChar
When a Unicode character can't be displayed, use this instead.
Definition: charset.c:61
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_choose()

char * mutt_ch_choose ( const char *  fromcode,
const struct Slist * charsets,
const char *  u,
size_t  ulen,
char **  d,
size_t *  dlen 
)

Figure the best charset to encode a string.

Parameters
[in] fromcode: Original charset of the string
[in] charsets: List of potential charsets to use
[in] u: String to encode
[in] ulen: Length of the string to encode
[out] d: If not NULL, point it to the converted string
[out] dlen: If not NULL, point it to the length of the d string
Return values
ptr: Best performing charset
NULL: None could be found

Definition at line 1110 of file charset.c.

1112{
1113 if (!fromcode || !charsets)
1114 return NULL;
1115
1116 char *e = NULL, *tocode = NULL;
1117 size_t elen = 0, bestn = 0;
1118
1119 const struct ListNode *np = NULL;
1120 STAILQ_FOREACH(np, &charsets->head, entries)
1121 {
1122 char *t = mutt_str_dup(np->data);
1123 if (!t)
1124 continue;
1125
1126 size_t n = mutt_str_len(t);
1127 char *s = mutt_strn_dup(u, ulen);
1128 const int rc = d ? mutt_ch_convert_string(&s, fromcode, t, MUTT_ICONV_NO_FLAGS) :
1129 mutt_ch_check(s, ulen, fromcode, t);
1130 if (rc)
1131 {
1132 FREE(&t);
1133 FREE(&s);
1134 continue;
1135 }
1136 size_t slen = mutt_str_len(s);
1137
1138 if (!tocode || (n < bestn))
1139 {
1140 bestn = n;
1141 FREE(&tocode);
1142 tocode = t;
1143 if (d)
1144 {
1145 FREE(&e);
1146 e = s;
1147 }
1148 else
1149 {
1150 FREE(&s);
1151 }
1152 elen = slen;
1153 }
1154 else
1155 {
1156 FREE(&t);
1157 FREE(&s);
1158 }
1159 }
1160 if (tocode)
1161 {
1162 if (d)
1163 *d = e;
1164 if (dlen)
1165 *dlen = elen;
1166
1167 char canonical_buf[1024] = { 0 };
1168 mutt_ch_canonical_charset(canonical_buf, sizeof(canonical_buf), tocode);
1169 mutt_str_replace(&tocode, canonical_buf);
1170 }
1171 return tocode;
1172}
int mutt_ch_check(const char *s, size_t slen, const char *from, const char *to)
Check whether a string can be converted between encodings.
Definition: charset.c:795
char * mutt_str_replace(char **p, const char *s)
Replace one string with another.
Definition: string.c:329
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_cache_cleanup()

void mutt_ch_cache_cleanup ( void  )

Clean up the cached iconv handles and charset strings.

Definition at line 1177 of file charset.c.

1178{
1179 for (int i = 0; i < IconvCacheUsed; ++i)
1180 {
1181 FREE(&IconvCache[i].fromcode1);
1182 FREE(&IconvCache[i].tocode1);
1183 if (iconv_t_valid(IconvCache[i].cd))
1184 {
1185 iconv_close(IconvCache[i].cd);
1186 }
1187 }
1188 IconvCacheUsed = 0;
1189}
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

Variable Documentation

◆ ReplacementChar

wchar_t ReplacementChar = '?'

When a Unicode character can't be displayed, use this instead.

Definition at line 61 of file charset.c.

◆ CharsetIsUtf8

bool CharsetIsUtf8 = false

Is the user's current character set utf-8?

Definition at line 66 of file charset.c.

◆ Lookups

struct LookupList Lookups = TAILQ_HEAD_INITIALIZER(Lookups)
static

Lookup table of preferred character set names.

Definition at line 83 of file charset.c.

◆ IconvCache

struct IconvCacheEntry IconvCache[ICONV_CACHE_SIZE]
static

Cache of iconv conversion descriptors.

Definition at line 98 of file charset.c.

◆ IconvCacheUsed

int IconvCacheUsed = 0
static

Number of iconv descriptors in the cache.

Definition at line 100 of file charset.c.

◆ PreferredMimeNames

const struct MimeNames PreferredMimeNames[]
static

Lookup table of preferred charsets.

The following list has been created manually from the data under: http://www.isi.edu/in-notes/iana/assignments/character-sets Last update: 2000-09-07

Note
It includes only the subset of character sets for which a preferred MIME name is given.

Definition at line 121 of file charset.c.