NeoMutt  2024-04-16-36-g75b6fb
Teaching an old dog new tricks
DOXYGEN
Loading...
Searching...
No Matches
charset.c File Reference

Conversion between different character encodings. More...

#include "config.h"
#include <errno.h>
#include <iconv.h>
#include <langinfo.h>
#include <limits.h>
#include <stdbool.h>
#include <stdio.h>
#include <string.h>
#include "charset.h"
#include "buffer.h"
#include "list.h"
#include "logging2.h"
#include "memory.h"
#include "pool.h"
#include "queue.h"
#include "regex3.h"
#include "slist.h"
#include "string2.h"
#include <libintl.h>
+ Include dependency graph for charset.c:

Go to the source code of this file.

Data Structures

struct  Lookup
 Regex to String lookup table. More...
 
struct  IconvCacheEntry
 Cached iconv conversion descriptor. More...
 
struct  MimeNames
 MIME name lookup entry. More...
 

Macros

#define EILSEQ   EINVAL
 
#define ICONV_CACHE_SIZE   16
 Max size of the iconv cache.
 

Functions

 TAILQ_HEAD (LookupList, Lookup)
 
static struct Lookup * lookup_new (void)
 Create a new Lookup.
 
static void lookup_free (struct Lookup **ptr)
 Free a Lookup.
 
static const char * lookup_charset (enum LookupType type, const char *cs)
 Look for a preferred character set name.
 
int mutt_ch_convert_nonmime_string (const struct Slist *const assumed_charset, const char *charset, char **ps)
 Try to convert a string using a list of character sets.
 
void mutt_ch_canonical_charset (char *buf, size_t buflen, const char *name)
 Canonicalise the charset of a string.
 
bool mutt_ch_chscmp (const char *cs1, const char *cs2)
 Are the names of two character sets equivalent?
 
const char * mutt_ch_get_default_charset (const struct Slist *const assumed_charset)
 Get the default character set.
 
char * mutt_ch_get_langinfo_charset (void)
 Get the user's choice of character set.
 
bool mutt_ch_lookup_add (enum LookupType type, const char *pat, const char *replace, struct Buffer *err)
 Add a new character set lookup.
 
void mutt_ch_lookup_remove (void)
 Remove all the character set lookups.
 
const char * mutt_ch_charset_lookup (const char *chs)
 Look for a replacement character set.
 
iconv_t mutt_ch_iconv_open (const char *tocode, const char *fromcode, uint8_t flags)
 Set up iconv for conversions.
 
size_t mutt_ch_iconv (iconv_t cd, const char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft, const char **inrepls, const char *outrepl, int *iconverrno)
 Change the encoding of a string.
 
const char * mutt_ch_iconv_lookup (const char *chs)
 Look for a replacement character set.
 
int mutt_ch_check (const char *s, size_t slen, const char *from, const char *to)
 Check whether a string can be converted between encodings.
 
int mutt_ch_convert_string (char **ps, const char *from, const char *to, uint8_t flags)
 Convert a string between encodings.
 
bool mutt_ch_check_charset (const char *cs, bool strict)
 Does iconv understand a character set?
 
struct FgetConv * mutt_ch_fgetconv_open (FILE *fp, const char *from, const char *to, uint8_t flags)
 Prepare a file for charset conversion.
 
void mutt_ch_fgetconv_close (struct FgetConv **ptr)
 Close an fgetconv handle.
 
int mutt_ch_fgetconv (struct FgetConv *fc)
 Convert a file's character set.
 
char * mutt_ch_fgetconvs (char *buf, size_t buflen, struct FgetConv *fc)
 Convert a file's charset into a string buffer.
 
void mutt_ch_set_charset (const char *charset)
 Update the records for a new character set.
 
char * mutt_ch_choose (const char *fromcode, const struct Slist *charsets, const char *u, size_t ulen, char **d, size_t *dlen)
 Figure the best charset to encode a string.
 
void mutt_ch_cache_cleanup (void)
 Clean up the cached iconv handles and charset strings.
 

Variables

wchar_t ReplacementChar = '?'
 When a Unicode character can't be displayed, use this instead.
 
bool CharsetIsUtf8 = false
 Is the user's current character set utf-8?
 
static struct LookupList Lookups = TAILQ_HEAD_INITIALIZER(Lookups)
 Lookup table of preferred character set names.
 
static struct IconvCacheEntry IconvCache [ICONV_CACHE_SIZE]
 Cache of iconv conversion descriptors.
 
static int IconvCacheUsed = 0
 Number of iconv descriptors in the cache.
 
static const struct MimeNames PreferredMimeNames []
 Lookup table of preferred charsets.
 

Detailed Description

Conversion between different character encodings.

Authors
  • Tobias Angele
  • Richard Russon
  • Pietro Cerutti
  • Steinar H Gunderson

This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with this program. If not, see http://www.gnu.org/licenses/.

Definition in file charset.c.

Macro Definition Documentation

◆ EILSEQ

#define EILSEQ   EINVAL

Definition at line 55 of file charset.c.

◆ ICONV_CACHE_SIZE

#define ICONV_CACHE_SIZE   16

Max size of the iconv cache.

Definition at line 96 of file charset.c.

Function Documentation

◆ TAILQ_HEAD()

TAILQ_HEAD ( LookupList  ,
Lookup   
)

◆ lookup_new()

static struct Lookup * lookup_new ( void  )
static

Create a new Lookup.

Return values
ptr: New Lookup

Definition at line 269 of file charset.c.

270{
271 return mutt_mem_calloc(1, sizeof(struct Lookup));
272}
void * mutt_mem_calloc(size_t nmemb, size_t size)
Allocate zeroed memory on the heap.
Definition: memory.c:50
Regex to String lookup table.
Definition: charset.c:74
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ lookup_free()

static void lookup_free ( struct Lookup **  ptr)
static

Free a Lookup.

Parameters
ptr: Lookup to free

Definition at line 278 of file charset.c.

279{
280 if (!ptr || !*ptr)
281 return;
282
283 struct Lookup *l = *ptr;
284 FREE(&l->replacement);
285 FREE(&l->regex.pattern);
286 if (l->regex.regex)
287 regfree(l->regex.regex);
288 FREE(&l->regex.regex);
289 FREE(&l->regex);
290
291 FREE(ptr);
292}
#define FREE(x)
Definition: memory.h:45
char * replacement
Alternative charset to use.
Definition: charset.c:77
struct Regex regex
Regular expression.
Definition: charset.c:76
char * pattern
printable version
Definition: regex3.h:86
regex_t * regex
compiled expression
Definition: regex3.h:87
+ Here is the caller graph for this function:

◆ lookup_charset()

static const char * lookup_charset ( enum LookupType  type,
const char *  cs 
)
static

Look for a preferred character set name.

Parameters
type: Type, e.g. MUTT_LOOKUP_CHARSET
cs: Character set
Return values
ptr: Charset string

If the character set matches one of the regexes, then return the replacement name.

Definition at line 303 of file charset.c.

304{
305 if (!cs)
306 return NULL;
307
308 struct Lookup *l = NULL;
309
310 TAILQ_FOREACH(l, &Lookups, entries)
311 {
312 if (l->type != type)
313 continue;
314 if (mutt_regex_match(&l->regex, cs))
315 return l->replacement;
316 }
317 return NULL;
318}
static struct LookupList Lookups
Lookup table of preferred character set names.
Definition: charset.c:83
bool mutt_regex_match(const struct Regex *regex, const char *str)
Shorthand to mutt_regex_capture()
Definition: regex.c:614
#define TAILQ_FOREACH(var, head, field)
Definition: queue.h:725
enum LookupType type
Lookup type.
Definition: charset.c:75
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_convert_nonmime_string()

int mutt_ch_convert_nonmime_string ( const struct Slist *const  assumed_charset,
const char *  charset,
char **  ps 
)

Try to convert a string using a list of character sets.

Parameters
[in] assumed_charset: From $assumed_charset
[in] charset: From $charset
[in,out] ps: String to be converted
Return values
0: Success
-1: Error

Work through $assumed_charset looking for a character set conversion that works. Failing that, try mutt_ch_get_default_charset().

Definition at line 331 of file charset.c.

333{
334 if (!ps)
335 return -1;
336
337 char *u = *ps;
338 const size_t ulen = mutt_str_len(u);
339 if (ulen == 0)
340 return 0;
341
342 const struct ListNode *np = NULL;
343 STAILQ_FOREACH(np, &assumed_charset->head, entries)
344 {
345 char const *c = np->data;
346 size_t n = mutt_str_len(c);
347 char *fromcode = mutt_mem_malloc(n + 1);
348 mutt_str_copy(fromcode, c, n + 1);
349 char *s = mutt_strn_dup(u, ulen);
350 int m = mutt_ch_convert_string(&s, fromcode, charset, MUTT_ICONV_NO_FLAGS);
351 FREE(&fromcode);
352 if (m == 0)
353 {
354 FREE(ps);
355 *ps = s;
356 return 0;
357 }
358 FREE(&s);
359 }
360 mutt_ch_convert_string(ps, mutt_ch_get_default_charset(assumed_charset),
361 charset, MUTT_ICONV_HOOK_FROM);
362 return -1;
363}
void * mutt_mem_malloc(size_t size)
Allocate memory on the heap.
Definition: memory.c:90
int mutt_ch_convert_string(char **ps, const char *from, const char *to, uint8_t flags)
Convert a string between encodings.
Definition: charset.c:831
const char * mutt_ch_get_default_charset(const struct Slist *const assumed_charset)
Get the default character set.
Definition: charset.c:465
#define MUTT_ICONV_HOOK_FROM
apply charset-hooks to fromcode
Definition: charset.h:73
#define MUTT_ICONV_NO_FLAGS
No flags are set.
Definition: charset.h:72
char * mutt_strn_dup(const char *begin, size_t len)
Duplicate a sub-string.
Definition: string.c:374
size_t mutt_str_len(const char *a)
Calculate the length of a string, safely.
Definition: string.c:490
size_t mutt_str_copy(char *dest, const char *src, size_t dsize)
Copy a string into a buffer (guaranteeing NUL-termination)
Definition: string.c:575
#define STAILQ_FOREACH(var, head, field)
Definition: queue.h:352
A List node for strings.
Definition: list.h:35
char * data
String.
Definition: list.h:36
struct ListHead head
List containing values.
Definition: slist.h:38
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_canonical_charset()

void mutt_ch_canonical_charset ( char *  buf,
size_t  buflen,
const char *  name 
)

Canonicalise the charset of a string.

Parameters
bufBuffer for canonical character set name
buflenLength of buffer
nameName to be canonicalised

This first ties off any charset extension such as "//TRANSLIT", canonicalizes the charset, and then re-adds the extension.

Definition at line 374 of file charset.c.

375{
376 if (!buf || !name)
377 return;
378
379 char in[1024] = { 0 };
380 char scratch[1024 + 10] = { 0 };
381 struct Buffer *canon = buf_pool_get();
382
383 mutt_str_copy(in, name, sizeof(in));
384 char *ext = strchr(in, '/');
385 if (ext)
386 *ext++ = '\0';
387
388 if (mutt_istr_equal(in, "utf-8") || mutt_istr_equal(in, "utf8"))
389 {
390 buf_strcpy(canon, "utf-8");
391 goto out;
392 }
393
394 /* catch some common iso-8859-something misspellings */
395 size_t plen;
396 if ((plen = mutt_istr_startswith(in, "8859")) && (in[plen] != '-'))
397 snprintf(scratch, sizeof(scratch), "iso-8859-%s", in + plen);
398 else if ((plen = mutt_istr_startswith(in, "8859-")))
399 snprintf(scratch, sizeof(scratch), "iso-8859-%s", in + plen);
400 else if ((plen = mutt_istr_startswith(in, "iso8859")) && (in[plen] != '-'))
401 snprintf(scratch, sizeof(scratch), "iso_8859-%s", in + plen);
402 else if ((plen = mutt_istr_startswith(in, "iso8859-")))
403 snprintf(scratch, sizeof(scratch), "iso_8859-%s", in + plen);
404 else
405 mutt_str_copy(scratch, in, sizeof(scratch));
406
407 for (size_t i = 0; PreferredMimeNames[i].key; i++)
408 {
409 if (mutt_istr_equal(scratch, PreferredMimeNames[i].key))
410 {
411 buf_strcpy(canon, PreferredMimeNames[i].pref);
412 goto out;
413 }
414 }
415
416 buf_strcpy(canon, scratch);
417 buf_lower(canon); // for cosmetics' sake
418
419out:
420 if (ext && (*ext != '\0'))
421 {
422 buf_addch(canon, '/');
423 buf_addstr(canon, ext);
424 }
425
426 mutt_str_copy(buf, buf_string(canon), buflen);
427 buf_pool_release(&canon);
428}
size_t buf_addch(struct Buffer *buf, char c)
Add a single character to a Buffer.
Definition: buffer.c:240
size_t buf_addstr(struct Buffer *buf, const char *s)
Add a string to a Buffer.
Definition: buffer.c:225
size_t buf_strcpy(struct Buffer *buf, const char *s)
Copy a string into a Buffer.
Definition: buffer.c:394
void buf_lower(struct Buffer *buf)
Sets a buffer to lowercase.
Definition: buffer.c:735
static const char * buf_string(const struct Buffer *buf)
Convert a buffer to a const char * "string".
Definition: buffer.h:96
static const struct MimeNames PreferredMimeNames[]
Lookup table of preferred charsets.
Definition: charset.c:121
bool mutt_istr_equal(const char *a, const char *b)
Compare two strings, ignoring case.
Definition: string.c:666
size_t mutt_istr_startswith(const char *str, const char *prefix)
Check whether a string starts with a prefix, ignoring case.
Definition: string.c:242
struct Buffer * buf_pool_get(void)
Get a Buffer from the pool.
Definition: pool.c:81
void buf_pool_release(struct Buffer **ptr)
Return a Buffer to the pool.
Definition: pool.c:94
String manipulation buffer.
Definition: buffer.h:36
const char * key
Definition: charset.c:107
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_chscmp()

bool mutt_ch_chscmp ( const char *  cs1,
const char *  cs2 
)

Are the names of two character sets equivalent?

Parameters
cs1First character set
cs2Second character set
Return values
trueNames are equivalent
falseNames differ

Charsets may have extensions that mutt_ch_canonical_charset() leaves intact. We expect 'cs2' to originate from neomutt code, not user input (i.e. 'cs2' does not have any extension), so we simply check whether the shorter string is a prefix of the longer.

Definition at line 442 of file charset.c.

443{
444 if (!cs1 || !cs2)
445 return false;
446
447 char buf[256] = { 0 };
448
449 mutt_ch_canonical_charset(buf, sizeof(buf), cs1);
450
451 int len1 = mutt_str_len(buf);
452 int len2 = mutt_str_len(cs2);
453
454 return mutt_istrn_equal(((len1 > len2) ? buf : cs2),
455 ((len1 > len2) ? cs2 : buf), MIN(len1, len2));
456}
#define MIN(a, b)
Definition: memory.h:32
void mutt_ch_canonical_charset(char *buf, size_t buflen, const char *name)
Canonicalise the charset of a string.
Definition: charset.c:374
bool mutt_istrn_equal(const char *a, const char *b, size_t num)
Check for equality of two strings ignoring case (to a maximum), safely.
Definition: string.c:447
+ Here is the call graph for this function:

◆ mutt_ch_get_default_charset()

const char * mutt_ch_get_default_charset ( const struct Slist *const  assumed_charset)

Get the default character set.

Parameters
assumed_charsetFrom $assumed_charset
Return values
ptrName of the default character set
Warning
This returns a pointer to a static buffer. Do not free it.

Definition at line 465 of file charset.c.

466{
467 static char fcharset[128];
468 const char *c = NULL;
469
470 if (assumed_charset && (assumed_charset->count > 0))
471 c = STAILQ_FIRST(&assumed_charset->head)->data;
472 else
473 c = "us-ascii";
474
475 mutt_str_copy(fcharset, c, sizeof(fcharset));
476 return fcharset;
477}
#define STAILQ_FIRST(head)
Definition: queue.h:350
size_t count
Number of values in list.
Definition: slist.h:39
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_get_langinfo_charset()

char * mutt_ch_get_langinfo_charset ( void  )

Get the user's choice of character set.

Return values
ptrCharset string

Get the canonical character set used by the user's locale. The caller must free the returned string.

Definition at line 486 of file charset.c.

487{
488 char buf[1024] = { 0 };
489
490 mutt_ch_canonical_charset(buf, sizeof(buf), nl_langinfo(CODESET));
491
492 if (buf[0] != '\0')
493 return mutt_str_dup(buf);
494
495 return mutt_str_dup("iso-8859-1");
496}
char * mutt_str_dup(const char *str)
Copy a string, safely.
Definition: string.c:253
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_lookup_add()

bool mutt_ch_lookup_add ( enum LookupType  type,
const char *  pat,
const char *  replace,
struct Buffer *  err 
)

Add a new character set lookup.

Parameters
typeType of character set, e.g. MUTT_LOOKUP_CHARSET
patPattern to match
replaceReplacement string
errBuffer for error message
Return values
trueLookup added to list
falseRegex string was invalid

Add a regex for a character set and a replacement name.

Definition at line 509 of file charset.c.

511{
512 if (!pat || !replace)
513 return false;
514
515 regex_t *rx = mutt_mem_calloc(1, sizeof(regex_t));
516 int rc = REG_COMP(rx, pat, REG_ICASE);
517 if (rc != 0)
518 {
519 regerror(rc, rx, err->data, err->dsize);
520 FREE(&rx);
521 return false;
522 }
523
524 struct Lookup *l = lookup_new();
525 l->type = type;
526 l->replacement = mutt_str_dup(replace);
527 l->regex.pattern = mutt_str_dup(pat);
528 l->regex.regex = rx;
529 l->regex.pat_not = false;
530
531 TAILQ_INSERT_TAIL(&Lookups, l, entries);
532
533 return true;
534}
static struct Lookup * lookup_new(void)
Create a new Lookup.
Definition: charset.c:269
#define TAILQ_INSERT_TAIL(head, elm, field)
Definition: queue.h:809
#define REG_COMP(preg, regex, cflags)
Compile a regular expression.
Definition: regex3.h:49
size_t dsize
Length of data.
Definition: buffer.h:39
char * data
Pointer to data.
Definition: buffer.h:37
bool pat_not
do not match
Definition: regex3.h:88
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_lookup_remove()

void mutt_ch_lookup_remove ( void  )

Remove all the character set lookups.

Empty the list of replacement character set names.

Definition at line 541 of file charset.c.

542{
543 struct Lookup *l = NULL;
544 struct Lookup *tmp = NULL;
545
546 TAILQ_FOREACH_SAFE(l, &Lookups, entries, tmp)
547 {
548 TAILQ_REMOVE(&Lookups, l, entries);
549 lookup_free(&l);
550 }
551}
static void lookup_free(struct Lookup **ptr)
Free a Lookup.
Definition: charset.c:278
#define TAILQ_FOREACH_SAFE(var, head, field, tvar)
Definition: queue.h:735
#define TAILQ_REMOVE(head, elm, field)
Definition: queue.h:841
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_charset_lookup()

const char * mutt_ch_charset_lookup ( const char *  chs)

Look for a replacement character set.

Parameters
chsCharacter set to lookup
Return values
ptrReplacement character set (if a 'charset-hook' matches)
NULLNo matching hook

Look through all the 'charset-hook's. If one matches return the replacement character set.

Definition at line 562 of file charset.c.

563{
564 return lookup_charset(MUTT_LOOKUP_CHARSET, chs);
565}
static const char * lookup_charset(enum LookupType type, const char *cs)
Look for a preferred character set name.
Definition: charset.c:303
@ MUTT_LOOKUP_CHARSET
Alias for another character set.
Definition: charset.h:68
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_iconv_open()

iconv_t mutt_ch_iconv_open ( const char *  tocode,
const char *  fromcode,
uint8_t  flags 
)

Set up iconv for conversions.

Parameters
tocode: Target character set
fromcode: Current character set
flags: Flags, e.g. MUTT_ICONV_HOOK_FROM
Return values
ptr: iconv handle for the conversion

Like iconv_open, but canonicalises the charsets, applies charset-hooks, recanonicalises, and finally applies iconv-hooks. Parameter flags=0 skips charset-hooks, while MUTT_ICONV_HOOK_FROM applies them to fromcode. Callers should use flags=0 when fromcode can safely be considered true, either some constant, or some value provided by the user; MUTT_ICONV_HOOK_FROM should be used only when fromcode is unsure, taken from a possibly wrong incoming MIME label, or such. Misusing MUTT_ICONV_HOOK_FROM leads to unwanted interactions in some setups.

Since calling iconv_open() repeatedly can be expensive, we keep a cache of the most recently used iconv_t objects, kept in LRU order. This means that you should not call iconv_close() on the object yourself. All remaining objects in the cache will be closed when main() calls mutt_ch_cache_cleanup().

Note
By design charset-hooks should never be, and are never, applied to tocode.
The top-well-named MUTT_ICONV_HOOK_FROM acts on charset-hooks, not at all on iconv-hooks.

Definition at line 594 of file charset.c.

595{
596 char tocode1[128] = { 0 };
597 char fromcode1[128] = { 0 };
598 const char *tocode2 = NULL, *fromcode2 = NULL;
599 const char *tmp = NULL;
600
601 /* transform to MIME preferred charset names */
602 mutt_ch_canonical_charset(tocode1, sizeof(tocode1), tocode);
603 mutt_ch_canonical_charset(fromcode1, sizeof(fromcode1), fromcode);
604
605 /* maybe apply charset-hooks and recanonicalise fromcode,
606 * but only when caller asked us to sanitize a potentially wrong
607 * charset name incoming from the wild exterior. */
608 if (flags & MUTT_ICONV_HOOK_FROM)
609 {
610 tmp = mutt_ch_charset_lookup(fromcode1);
611 if (tmp)
612 mutt_ch_canonical_charset(fromcode1, sizeof(fromcode1), tmp);
613 }
614
615 /* check if we have this pair cached already */
616 for (int i = 0; i < IconvCacheUsed; ++i)
617 {
618 if (strcmp(tocode1, IconvCache[i].tocode1) == 0 &&
619 strcmp(fromcode1, IconvCache[i].fromcode1) == 0)
620 {
621 iconv_t cd = IconvCache[i].cd;
622
623 /* make room for this one at the top */
624 struct IconvCacheEntry top = IconvCache[i];
625 for (int j = i; j-- > 0;)
626 {
627 IconvCache[j + 1] = IconvCache[j];
628 }
629 IconvCache[0] = top;
630
631 if (iconv_t_valid(cd))
632 {
633 /* reset state */
634 iconv(cd, NULL, NULL, NULL, NULL);
635 }
636 return cd;
637 }
638 }
639
640 /* not found in cache */
641 /* always apply iconv-hooks to suit system's iconv tastes */
642 tocode2 = mutt_ch_iconv_lookup(tocode1);
643 tocode2 = tocode2 ? tocode2 : tocode1;
644 fromcode2 = mutt_ch_iconv_lookup(fromcode1);
645 fromcode2 = fromcode2 ? fromcode2 : fromcode1;
646
647 /* call system iconv with names it appreciates */
648 iconv_t cd = iconv_open(tocode2, fromcode2);
649
651 {
652 mutt_debug(LL_DEBUG2, "iconv: dropping %s -> %s from the cache\n",
655 /* get rid of the oldest entry */
659 {
660 iconv_close(IconvCache[IconvCacheUsed - 1].cd);
661 }
663 }
664
665 /* make room for this one at the top */
666 for (int j = IconvCacheUsed; j-- > 0;)
667 {
668 IconvCache[j + 1] = IconvCache[j];
669 }
670
672
673 mutt_debug(LL_DEBUG2, "iconv: adding %s -> %s to the cache\n", fromcode1, tocode1);
674 IconvCache[0].fromcode1 = strdup(fromcode1);
675 IconvCache[0].tocode1 = strdup(tocode1);
676 IconvCache[0].cd = cd;
677
678 return cd;
679}
#define mutt_debug(LEVEL,...)
Definition: logging2.h:89
@ LL_DEBUG2
Log at debug level 2.
Definition: logging2.h:44
static int IconvCacheUsed
Number of iconv descriptors in the cache.
Definition: charset.c:100
const char * mutt_ch_iconv_lookup(const char *chs)
Look for a replacement character set.
Definition: charset.c:781
const char * mutt_ch_charset_lookup(const char *chs)
Look for a replacement character set.
Definition: charset.c:562
#define ICONV_CACHE_SIZE
Max size of the iconv cache.
Definition: charset.c:96
static struct IconvCacheEntry IconvCache[ICONV_CACHE_SIZE]
Cache of iconv conversion descriptors.
Definition: charset.c:98
static bool iconv_t_valid(const iconv_t cd)
Is the conversion descriptor valid?
Definition: charset.h:113
Cached iconv conversion descriptor.
Definition: charset.c:89
char * tocode1
Destination character set.
Definition: charset.c:91
char * fromcode1
Source character set.
Definition: charset.c:90
iconv_t cd
iconv conversion descriptor
Definition: charset.c:92
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_iconv()

size_t mutt_ch_iconv ( iconv_t  cd,
const char **  inbuf,
size_t *  inbytesleft,
char **  outbuf,
size_t *  outbytesleft,
const char **  inrepls,
const char *  outrepl,
int *  iconverrno 
)

Change the encoding of a string.

Parameters
[in] cd: Iconv conversion descriptor
[in,out] inbuf: Buffer to convert
[in,out] inbytesleft: Length of buffer to convert
[in,out] outbuf: Buffer for the result
[in,out] outbytesleft: Length of result buffer
[in] inrepls: Input replacement characters
[in] outrepl: Output replacement characters
[out] iconverrno: Errno if iconv() fails, 0 if it succeeds
Return values
num: Characters converted

Like iconv, but keeps going even when the input is invalid. If you're supplying inrepls, the source charset should be stateless; if you're supplying an outrepl, the target charset should be.

Definition at line 697 of file charset.c.

700{
701 size_t rc = 0;
702 const char *ib = *inbuf;
703 size_t ibl = *inbytesleft;
704 char *ob = *outbuf;
705 size_t obl = *outbytesleft;
706
707 while (true)
708 {
709 errno = 0;
710 const size_t ret1 = iconv(cd, (ICONV_CONST char **) &ib, &ibl, &ob, &obl);
711 if (ret1 != ICONV_ILLEGAL_SEQ)
712 rc += ret1;
713 if (iconverrno)
714 *iconverrno = errno;
715
716 if (ibl && obl && (errno == EILSEQ))
717 {
718 if (inrepls)
719 {
720 /* Try replacing the input */
721 const char **t = NULL;
722 for (t = inrepls; *t; t++)
723 {
724 const char *ib1 = *t;
725 size_t ibl1 = strlen(*t);
726 char *ob1 = ob;
727 size_t obl1 = obl;
728 iconv(cd, (ICONV_CONST char **) &ib1, &ibl1, &ob1, &obl1);
729 if (ibl1 == 0)
730 {
731 ib++;
732 ibl--;
733 ob = ob1;
734 obl = obl1;
735 rc++;
736 break;
737 }
738 }
739 if (*t)
740 continue;
741 }
742 /* Replace the output */
743 if (!outrepl)
744 outrepl = "?";
745 iconv(cd, NULL, NULL, &ob, &obl);
746 if (obl)
747 {
748 int n = strlen(outrepl);
749 if (n > obl)
750 {
751 outrepl = "?";
752 n = 1;
753 }
754 memcpy(ob, outrepl, n);
755 ib++;
756 ibl--;
757 ob += n;
758 obl -= n;
759 rc++;
760 iconv(cd, NULL, NULL, NULL, NULL); /* for good measure */
761 continue;
762 }
763 }
764 *inbuf = ib;
765 *inbytesleft = ibl;
766 *outbuf = ob;
767 *outbytesleft = obl;
768 return rc;
769 }
770}
#define EILSEQ
Definition: charset.c:55
#define ICONV_ILLEGAL_SEQ
Error value for iconv() - Illegal sequence.
Definition: charset.h:104
+ Here is the caller graph for this function:

◆ mutt_ch_iconv_lookup()

const char * mutt_ch_iconv_lookup ( const char *  chs)

Look for a replacement character set.

Parameters
chsCharacter set to lookup
Return values
ptrReplacement character set (if a 'iconv-hook' matches)
NULLNo matching hook

Look through all the 'iconv-hook's. If one matches return the replacement character set.

Definition at line 781 of file charset.c.

782{
783 return lookup_charset(MUTT_LOOKUP_ICONV, chs);
784}
@ MUTT_LOOKUP_ICONV
Character set conversion.
Definition: charset.h:69
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_check()

int mutt_ch_check ( const char *  s,
size_t  slen,
const char *  from,
const char *  to 
)

Check whether a string can be converted between encodings.

Parameters
[in] s: String to check
[in] slen: Length of the string to check
[in] from: Current character set
[in] to: Target character set
Return values
0: Success
-1: Error in iconv_open()
>0: Errno as set by iconv()

Definition at line 796 of file charset.c.

797{
798 if (!s || !from || !to)
799 return -1;
800
801 int rc = 0;
802 iconv_t cd = mutt_ch_iconv_open(to, from, MUTT_ICONV_NO_FLAGS);
803 if (!iconv_t_valid(cd))
804 return -1;
805
806 size_t outlen = MB_LEN_MAX * slen;
807 char *out = mutt_mem_malloc(outlen + 1);
808 char *saved_out = out;
809
810 const size_t convlen = iconv(cd, (ICONV_CONST char **) &s, &slen, &out, &outlen);
811 if (convlen == ICONV_ILLEGAL_SEQ)
812 rc = errno;
813
814 FREE(&saved_out);
815 return rc;
816}
iconv_t mutt_ch_iconv_open(const char *tocode, const char *fromcode, uint8_t flags)
Set up iconv for conversions.
Definition: charset.c:594
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_convert_string()

int mutt_ch_convert_string ( char **  ps,
const char *  from,
const char *  to,
uint8_t  flags 
)

Convert a string between encodings.

Parameters
[in,out]psString to convert
[in]fromCurrent character set
[in]toTarget character set
[in]flagsFlags, e.g. MUTT_ICONV_HOOK_FROM
Return values
0Success
-1Invalid arguments or failure to open an iconv channel
errnoFailure in iconv conversion

Parameter flags is given as-is to mutt_ch_iconv_open(). See there for its meaning and usage policy.

Definition at line 831 of file charset.c.

832{
833 if (!ps)
834 return -1;
835
836 char *s = *ps;
837
838 if (!s || (*s == '\0'))
839 return 0;
840
841 if (!to || !from)
842 return -1;
843
844 const char *repls[] = { "\357\277\275", "?", 0 };
845 int rc = 0;
846
847 iconv_t cd = mutt_ch_iconv_open(to, from, flags);
848 if (!iconv_t_valid(cd))
849 return -1;
850
851 const char **inrepls = NULL;
852 const char *outrepl = NULL;
853
854 if (mutt_ch_is_utf8(to))
855 outrepl = "\357\277\275";
856 else if (mutt_ch_is_utf8(from))
857 inrepls = repls;
858 else
859 outrepl = "?";
860
861 const char *ib = s;
862 size_t ibl = strlen(s);
863 if (ibl >= (SIZE_MAX / MB_LEN_MAX))
864 {
865 return -1;
866 }
867 size_t obl = MB_LEN_MAX * ibl;
868 char *buf = mutt_mem_malloc(obl + 1);
869 char *ob = buf;
870
871 mutt_ch_iconv(cd, &ib, &ibl, &ob, &obl, inrepls, outrepl, &rc);
872 iconv(cd, 0, 0, &ob, &obl);
873
874 *ob = '\0';
875
876 FREE(ps);
877 *ps = buf;
878
879 mutt_str_adjust(ps);
880 return rc;
881}
size_t mutt_ch_iconv(iconv_t cd, const char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft, const char **inrepls, const char *outrepl, int *iconverrno)
Change the encoding of a string.
Definition: charset.c:697
#define mutt_ch_is_utf8(str)
Definition: charset.h:97
void mutt_str_adjust(char **ptr)
Shrink-to-fit a string.
Definition: string.c:293
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_check_charset()

bool mutt_ch_check_charset ( const char *  cs,
bool  strict 
)

Does iconv understand a character set?

Parameters
csCharacter set to check
strictCheck strictly by using iconv
Return values
trueCharacter set is valid

If strict is false, then finding a matching character set in PreferredMimeNames will be enough. If strict is true, or the charset is not in PreferredMimeNames, then iconv() will be run.

Definition at line 894 of file charset.c.

895{
896 if (!cs)
897 return false;
898
899 if (mutt_ch_is_utf8(cs))
900 return true;
901
902 if (!strict)
903 {
904 for (int i = 0; PreferredMimeNames[i].key; i++)
905 {
906 if (mutt_istr_equal(PreferredMimeNames[i].key, cs) ||
907 mutt_istr_equal(PreferredMimeNames[i].pref, cs))
908 {
909 return true;
910 }
911 }
912 }
913
914 iconv_t cd = mutt_ch_iconv_open(cs, cs, MUTT_ICONV_NO_FLAGS);
915 if (iconv_t_valid(cd))
916 {
917 return true;
918 }
919
920 return false;
921}
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_fgetconv_open()

struct FgetConv * mutt_ch_fgetconv_open ( FILE *  fp,
const char *  from,
const char *  to,
uint8_t  flags 
)

Prepare a file for charset conversion.

Parameters
fpFILE ptr to prepare
fromCurrent character set
toDestination character set
flagsFlags, e.g. MUTT_ICONV_HOOK_FROM
Return values
ptrfgetconv handle

Parameter flags is given as-is to mutt_ch_iconv_open().

Definition at line 933 of file charset.c.

934{
935 struct FgetConv *fc = NULL;
936 iconv_t cd = ICONV_T_INVALID;
937
938 if (from && to)
939 cd = mutt_ch_iconv_open(to, from, flags);
940
941 if (iconv_t_valid(cd))
942 {
943 static const char *repls[] = { "\357\277\275", "?", 0 };
944
945 fc = mutt_mem_malloc(sizeof(struct FgetConv));
946 fc->p = fc->bufo;
947 fc->ob = fc->bufo;
948 fc->ib = fc->bufi;
949 fc->ibl = 0;
950 fc->inrepls = mutt_ch_is_utf8(to) ? repls : repls + 1;
951 }
952 else
953 {
954 fc = mutt_mem_malloc(sizeof(struct FgetConvNot));
955 }
956 fc->fp = fp;
957 fc->cd = cd;
958 return fc;
959}
#define ICONV_T_INVALID
Error value for iconv functions.
Definition: charset.h:101
A dummy converter.
Definition: charset.h:58
Cursor for converting a file's encoding.
Definition: charset.h:42
char bufi[512]
Definition: charset.h:45
iconv_t cd
iconv conversion descriptor
Definition: charset.h:44
char bufo[512]
Definition: charset.h:46
size_t ibl
Definition: charset.h:50
FILE * fp
Definition: charset.h:43
char * p
Definition: charset.h:47
const char ** inrepls
Definition: charset.h:51
char * ib
Definition: charset.h:49
char * ob
Definition: charset.h:48
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_fgetconv_close()

void mutt_ch_fgetconv_close ( struct FgetConv **  ptr)

Close an fgetconv handle.

Parameters
[out] ptr: fgetconv handle

Definition at line 965 of file charset.c.

/**
 * mutt_ch_fgetconv_close - Close an fgetconv handle
 * @param[out] ptr fgetconv handle
 *
 * Does nothing if `ptr` or `*ptr` is NULL.
 *
 * @note Only the handle's memory is released; neither the iconv descriptor
 *       nor the FILE is closed by this function.
 */
void mutt_ch_fgetconv_close(struct FgetConv **ptr)
{
  if (ptr && *ptr)
  {
    FREE(ptr);
  }
}
+ Here is the caller graph for this function:

◆ mutt_ch_fgetconv()

int mutt_ch_fgetconv ( struct FgetConv *  fc)

Convert a file's character set.

Parameters
fc: FgetConv handle
Return values
num: Next character in the converted file
EOF: Error

A file is read into a buffer and its character set is converted. Each call to this function will return one converted character. The buffer is refilled automatically when empty.

Definition at line 983 of file charset.c.

984{
985 if (!fc)
986 return EOF;
987 if (!iconv_t_valid(fc->cd))
988 return fgetc(fc->fp);
989 if (!fc->p)
990 return EOF;
991 if (fc->p < fc->ob)
992 return (unsigned char) *(fc->p)++;
993
994 /* Try to convert some more */
995 fc->p = fc->bufo;
996 fc->ob = fc->bufo;
997 if (fc->ibl)
998 {
999 size_t obl = sizeof(fc->bufo);
1000 iconv(fc->cd, (ICONV_CONST char **) &fc->ib, &fc->ibl, &fc->ob, &obl);
1001 if (fc->p < fc->ob)
1002 return (unsigned char) *(fc->p)++;
1003 }
1004
1005 /* If we trusted iconv a bit more, we would at this point
1006 * ask why it had stopped converting ... */
1007
1008 /* Try to read some more */
1009 if ((fc->ibl == sizeof(fc->bufi)) ||
1010 (fc->ibl && (fc->ib + fc->ibl < fc->bufi + sizeof(fc->bufi))))
1011 {
1012 fc->p = 0;
1013 return EOF;
1014 }
1015 if (fc->ibl)
1016 memcpy(fc->bufi, fc->ib, fc->ibl);
1017 fc->ib = fc->bufi;
1018 fc->ibl += fread(fc->ib + fc->ibl, 1, sizeof(fc->bufi) - fc->ibl, fc->fp);
1019
1020 /* Try harder this time to convert some */
1021 if (fc->ibl)
1022 {
1023 size_t obl = sizeof(fc->bufo);
1024 mutt_ch_iconv(fc->cd, (const char **) &fc->ib, &fc->ibl, &fc->ob, &obl,
1025 fc->inrepls, 0, NULL);
1026 if (fc->p < fc->ob)
1027 return (unsigned char) *(fc->p)++;
1028 }
1029
1030 /* Either the file has finished or one of the buffers is too small */
1031 fc->p = 0;
1032 return EOF;
1033}
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_fgetconvs()

char * mutt_ch_fgetconvs ( char *  buf,
size_t  buflen,
struct FgetConv *  fc 
)

Convert a file's charset into a string buffer.

Parameters
buf: Buffer for result
buflen: Length of buffer
fc: FgetConv handle
Return values
ptr: Success, result buffer
NULL: Error

Read a file into a buffer, converting the character set as it goes.

Definition at line 1045 of file charset.c.

/**
 * mutt_ch_fgetconvs - Convert a file's charset into a string buffer
 * @param buf    Buffer for result
 * @param buflen Length of buffer
 * @param fc     FgetConv handle
 * @retval ptr  Success, result buffer
 * @retval NULL Error, or nothing could be read
 *
 * Read a file into a buffer, converting the character set as it goes.
 *
 * Reading stops after a newline (which is stored), when mutt_ch_fgetconv()
 * returns EOF, or when only room for the terminating NUL is left.
 * The result is always NUL-terminated.
 */
char *mutt_ch_fgetconvs(char *buf, size_t buflen, struct FgetConv *fc)
{
  /* A zero-length buffer has no room even for the terminating NUL */
  if (!buf || (buflen == 0))
    return NULL;

  size_t r;
  for (r = 0; (r + 1) < buflen;)
  {
    const int c = mutt_ch_fgetconv(fc);
    if (c == EOF)
      break;
    buf[r++] = (char) c;
    if (c == '\n')
      break;
  }
  buf[r] = '\0';

  if (r > 0)
    return buf;

  return NULL;
}
int mutt_ch_fgetconv(struct FgetConv *fc)
Convert a file's character set.
Definition: charset.c:983
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_set_charset()

void mutt_ch_set_charset ( const char *  charset)

Update the records for a new character set.

Parameters
charset: New character set

Check if this character set is utf-8 and pick a suitable replacement character for unprintable characters.

Note
This calls bind_textdomain_codeset() which will affect future message translations.

Definition at line 1078 of file charset.c.

1079{
1080 char buf[256] = { 0 };
1081
1082 mutt_ch_canonical_charset(buf, sizeof(buf), charset);
1083
1084 if (mutt_ch_is_utf8(buf))
1085 {
1086 CharsetIsUtf8 = true;
1087 ReplacementChar = 0xfffd; /* replacement character */
1088 }
1089 else
1090 {
1091 CharsetIsUtf8 = false;
1092 ReplacementChar = '?';
1093 }
1094
1095#if defined(HAVE_BIND_TEXTDOMAIN_CODESET) && defined(ENABLE_NLS)
1096 bind_textdomain_codeset(PACKAGE, buf);
1097#endif
1098}
bool CharsetIsUtf8
Is the user's current character set utf-8?
Definition: charset.c:66
wchar_t ReplacementChar
When a Unicode character can't be displayed, use this instead.
Definition: charset.c:61
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_choose()

char * mutt_ch_choose ( const char *  fromcode,
const struct Slist *  charsets,
const char *  u,
size_t  ulen,
char **  d,
size_t *  dlen 
)

Figure the best charset to encode a string.

Parameters
[in] fromcode: Original charset of the string
[in] charsets: List of potential charsets to use
[in] u: String to encode
[in] ulen: Length of the string to encode
[out] d: If not NULL, point it to the converted string
[out] dlen: If not NULL, point it to the length of the d string
Return values
ptr: Best performing charset
NULL: None could be found

Definition at line 1111 of file charset.c.

1113{
1114 if (!fromcode || !charsets)
1115 return NULL;
1116
1117 char *e = NULL, *tocode = NULL;
1118 size_t elen = 0, bestn = 0;
1119
1120 const struct ListNode *np = NULL;
1121 STAILQ_FOREACH(np, &charsets->head, entries)
1122 {
1123 char *t = mutt_str_dup(np->data);
1124 if (!t)
1125 continue;
1126
1127 size_t n = mutt_str_len(t);
1128 char *s = mutt_strn_dup(u, ulen);
1129 const int rc = d ? mutt_ch_convert_string(&s, fromcode, t, MUTT_ICONV_NO_FLAGS) :
1130 mutt_ch_check(s, ulen, fromcode, t);
1131 if (rc)
1132 {
1133 FREE(&t);
1134 FREE(&s);
1135 continue;
1136 }
1137 size_t slen = mutt_str_len(s);
1138
1139 if (!tocode || (n < bestn))
1140 {
1141 bestn = n;
1142 FREE(&tocode);
1143 tocode = t;
1144 if (d)
1145 {
1146 FREE(&e);
1147 e = s;
1148 }
1149 else
1150 {
1151 FREE(&s);
1152 }
1153 elen = slen;
1154 }
1155 else
1156 {
1157 FREE(&t);
1158 FREE(&s);
1159 }
1160 }
1161 if (tocode)
1162 {
1163 if (d)
1164 *d = e;
1165 if (dlen)
1166 *dlen = elen;
1167
1168 char canonical_buf[1024] = { 0 };
1169 mutt_ch_canonical_charset(canonical_buf, sizeof(canonical_buf), tocode);
1170 mutt_str_replace(&tocode, canonical_buf);
1171 }
1172 return tocode;
1173}
int mutt_ch_check(const char *s, size_t slen, const char *from, const char *to)
Check whether a string can be converted between encodings.
Definition: charset.c:796
char * mutt_str_replace(char **p, const char *s)
Replace one string with another.
Definition: string.c:274
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_cache_cleanup()

void mutt_ch_cache_cleanup ( void  )

Clean up the cached iconv handles and charset strings.

Definition at line 1178 of file charset.c.

1179{
1180 for (int i = 0; i < IconvCacheUsed; ++i)
1181 {
1182 FREE(&IconvCache[i].fromcode1);
1183 FREE(&IconvCache[i].tocode1);
1184 if (iconv_t_valid(IconvCache[i].cd))
1185 {
1186 iconv_close(IconvCache[i].cd);
1187 }
1188 }
1189 IconvCacheUsed = 0;
1190}
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

Variable Documentation

◆ ReplacementChar

wchar_t ReplacementChar = '?'

When a Unicode character can't be displayed, use this instead.

Definition at line 61 of file charset.c.

◆ CharsetIsUtf8

bool CharsetIsUtf8 = false

Is the user's current character set utf-8?

Definition at line 66 of file charset.c.

◆ Lookups

struct LookupList Lookups = TAILQ_HEAD_INITIALIZER(Lookups)
static

Lookup table of preferred character set names.

Definition at line 83 of file charset.c.

◆ IconvCache

struct IconvCacheEntry IconvCache[ICONV_CACHE_SIZE]
static

Cache of iconv conversion descriptors.

Definition at line 98 of file charset.c.

◆ IconvCacheUsed

int IconvCacheUsed = 0
static

Number of iconv descriptors in the cache.

Definition at line 100 of file charset.c.

◆ PreferredMimeNames

const struct MimeNames PreferredMimeNames[]
static

Lookup table of preferred charsets.

The following list has been created manually from the data under: http://www.isi.edu/in-notes/iana/assignments/character-sets Last update: 2000-09-07

Note
It includes only the subset of character sets for which a preferred MIME name is given.

Definition at line 121 of file charset.c.