NeoMutt  2023-05-17-56-ga67199
Teaching an old dog new tricks
DOXYGEN
charset.c File Reference

Conversion between different character encodings. More...

#include "config.h"
#include <ctype.h>
#include <errno.h>
#include <iconv.h>
#include <langinfo.h>
#include <limits.h>
#include <stdbool.h>
#include <stdio.h>
#include <string.h>
#include "charset.h"
#include "lib.h"
#include "memory.h"
#include "queue.h"
#include "regex3.h"
#include <libintl.h>
+ Include dependency graph for charset.c:

Go to the source code of this file.

Data Structures

struct  Lookup
 Regex to String lookup table. More...
 
struct  IconvCacheEntry
 Cached iconv conversion descriptor. More...
 
struct  MimeNames
 MIME name lookup entry. More...
 

Macros

#define EILSEQ   EINVAL
 
#define ICONV_CACHE_SIZE   16
 Max size of the iconv cache. More...
 

Functions

 TAILQ_HEAD (LookupList, Lookup)
 
static struct Lookup * lookup_new (void)
 Create a new Lookup. More...
 
static void lookup_free (struct Lookup **ptr)
 Free a Lookup. More...
 
static const char * lookup_charset (enum LookupType type, const char *cs)
 Look for a preferred character set name. More...
 
int mutt_ch_convert_nonmime_string (const struct Slist *const assumed_charset, const char *charset, char **ps)
 Try to convert a string using a list of character sets. More...
 
void mutt_ch_canonical_charset (char *buf, size_t buflen, const char *name)
 Canonicalise the charset of a string. More...
 
bool mutt_ch_chscmp (const char *cs1, const char *cs2)
 Are the names of two character sets equivalent? More...
 
const char * mutt_ch_get_default_charset (const struct Slist *const assumed_charset)
 Get the default character set. More...
 
char * mutt_ch_get_langinfo_charset (void)
 Get the user's choice of character set. More...
 
bool mutt_ch_lookup_add (enum LookupType type, const char *pat, const char *replace, struct Buffer *err)
 Add a new character set lookup. More...
 
void mutt_ch_lookup_remove (void)
 Remove all the character set lookups. More...
 
const char * mutt_ch_charset_lookup (const char *chs)
 Look for a replacement character set. More...
 
iconv_t mutt_ch_iconv_open (const char *tocode, const char *fromcode, uint8_t flags)
 Set up iconv for conversions. More...
 
size_t mutt_ch_iconv (iconv_t cd, const char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft, const char **inrepls, const char *outrepl, int *iconverrno)
 Change the encoding of a string. More...
 
const char * mutt_ch_iconv_lookup (const char *chs)
 Look for a replacement character set. More...
 
int mutt_ch_check (const char *s, size_t slen, const char *from, const char *to)
 Check whether a string can be converted between encodings. More...
 
int mutt_ch_convert_string (char **ps, const char *from, const char *to, uint8_t flags)
 Convert a string between encodings. More...
 
bool mutt_ch_check_charset (const char *cs, bool strict)
 Does iconv understand a character set? More...
 
struct FgetConv * mutt_ch_fgetconv_open (FILE *fp, const char *from, const char *to, uint8_t flags)
 Prepare a file for charset conversion. More...
 
void mutt_ch_fgetconv_close (struct FgetConv **fc)
 Close an fgetconv handle. More...
 
int mutt_ch_fgetconv (struct FgetConv *fc)
 Convert a file's character set. More...
 
char * mutt_ch_fgetconvs (char *buf, size_t buflen, struct FgetConv *fc)
 Convert a file's charset into a string buffer. More...
 
void mutt_ch_set_charset (const char *charset)
 Update the records for a new character set. More...
 
char * mutt_ch_choose (const char *fromcode, const struct Slist *charsets, const char *u, size_t ulen, char **d, size_t *dlen)
 Figure the best charset to encode a string. More...
 
void mutt_ch_cache_cleanup (void)
 Clean up the cached iconv handles and charset strings. More...
 

Variables

wchar_t ReplacementChar = '?'
 When a Unicode character can't be displayed, use this instead. More...
 
bool CharsetIsUtf8 = false
 Is the user's current character set utf-8? More...
 
static struct LookupList Lookups = TAILQ_HEAD_INITIALIZER(Lookups)
 Lookup table of preferred character set names. More...
 
static struct IconvCacheEntry IconvCache [ICONV_CACHE_SIZE]
 Cache of iconv conversion descriptors. More...
 
static int IconvCacheUsed = 0
 Number of iconv descriptors in the cache. More...
 
static const struct MimeNames PreferredMimeNames []
 Lookup table of preferred charsets. More...
 

Detailed Description

Conversion between different character encodings.

Authors
  • Thomas Roessler

This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with this program. If not, see http://www.gnu.org/licenses/.

Definition in file charset.c.

Macro Definition Documentation

◆ EILSEQ

#define EILSEQ   EINVAL

Definition at line 48 of file charset.c.

◆ ICONV_CACHE_SIZE

#define ICONV_CACHE_SIZE   16

Max size of the iconv cache.

Definition at line 89 of file charset.c.

Function Documentation

◆ TAILQ_HEAD()

TAILQ_HEAD ( LookupList  ,
Lookup   
)

◆ lookup_new()

static struct Lookup * lookup_new ( void  )
static

Create a new Lookup.

Return values
ptr  New Lookup

Definition at line 262 of file charset.c.

263{
264 return mutt_mem_calloc(1, sizeof(struct Lookup));
265}
void * mutt_mem_calloc(size_t nmemb, size_t size)
Allocate zeroed memory on the heap.
Definition: memory.c:50
Regex to String lookup table.
Definition: charset.c:67
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ lookup_free()

static void lookup_free ( struct Lookup **  ptr)
static

Free a Lookup.

Parameters
ptr  Lookup to free

Definition at line 271 of file charset.c.

272{
273 if (!ptr || !*ptr)
274 return;
275
276 struct Lookup *l = *ptr;
277 FREE(&l->replacement);
278 FREE(&l->regex.pattern);
279 if (l->regex.regex)
280 regfree(l->regex.regex);
281 FREE(&l->regex.regex);
282 FREE(&l->regex);
283
284 FREE(ptr);
285}
#define FREE(x)
Definition: memory.h:43
char * replacement
Alternative charset to use.
Definition: charset.c:70
struct Regex regex
Regular expression.
Definition: charset.c:69
char * pattern
printable version
Definition: regex3.h:90
regex_t * regex
compiled expression
Definition: regex3.h:91
+ Here is the caller graph for this function:

◆ lookup_charset()

static const char * lookup_charset ( enum LookupType  type,
const char *  cs 
)
static

Look for a preferred character set name.

Parameters
type  Type, e.g. MUTT_LOOKUP_CHARSET
cs  Character set
Return values
ptr  Charset string

If the character set matches one of the regexes, then return the replacement name.

Definition at line 296 of file charset.c.

297{
298 if (!cs)
299 return NULL;
300
301 struct Lookup *l = NULL;
302
303 TAILQ_FOREACH(l, &Lookups, entries)
304 {
305 if (l->type != type)
306 continue;
307 if (mutt_regex_match(&l->regex, cs))
308 return l->replacement;
309 }
310 return NULL;
311}
static struct LookupList Lookups
Lookup table of preferred character set names.
Definition: charset.c:76
bool mutt_regex_match(const struct Regex *regex, const char *str)
Shorthand to mutt_regex_capture()
Definition: regex.c:635
#define TAILQ_FOREACH(var, head, field)
Definition: queue.h:725
enum LookupType type
Lookup type.
Definition: charset.c:68
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_convert_nonmime_string()

int mutt_ch_convert_nonmime_string ( const struct Slist *const  assumed_charset,
const char *  charset,
char **  ps 
)

Try to convert a string using a list of character sets.

Parameters
[in]  assumed_charset  From $assumed_charset
[in]  charset  From $charset
[in,out]  ps  String to be converted
Return values
0  Success
-1  Error

Work through $assumed_charset looking for a character set conversion that works. Failing that, try mutt_ch_get_default_charset().

Definition at line 324 of file charset.c.

326{
327 if (!ps)
328 return -1;
329
330 char *u = *ps;
331 const size_t ulen = mutt_str_len(u);
332 if (ulen == 0)
333 return 0;
334
335 const struct ListNode *np = NULL;
336 STAILQ_FOREACH(np, &assumed_charset->head, entries)
337 {
338 char const *c = np->data;
339 size_t n = mutt_str_len(c);
340 char *fromcode = mutt_mem_malloc(n + 1);
341 mutt_str_copy(fromcode, c, n + 1);
342 char *s = mutt_strn_dup(u, ulen);
343 int m = mutt_ch_convert_string(&s, fromcode, charset, MUTT_ICONV_NO_FLAGS);
344 FREE(&fromcode);
345 if (m == 0)
346 {
347 FREE(ps);
348 *ps = s;
349 return 0;
350 }
351 FREE(&s);
352 }
354 charset, MUTT_ICONV_HOOK_FROM);
355 return -1;
356}
void * mutt_mem_malloc(size_t size)
Allocate memory on the heap.
Definition: memory.c:90
int mutt_ch_convert_string(char **ps, const char *from, const char *to, uint8_t flags)
Convert a string between encodings.
Definition: charset.c:822
const char * mutt_ch_get_default_charset(const struct Slist *const assumed_charset)
Get the default character set.
Definition: charset.c:456
#define MUTT_ICONV_HOOK_FROM
apply charset-hooks to fromcode
Definition: charset.h:72
#define MUTT_ICONV_NO_FLAGS
No flags are set.
Definition: charset.h:71
char * mutt_strn_dup(const char *begin, size_t len)
Duplicate a sub-string.
Definition: string.c:452
size_t mutt_str_len(const char *a)
Calculate the length of a string, safely.
Definition: string.c:568
size_t mutt_str_copy(char *dest, const char *src, size_t dsize)
Copy a string into a buffer (guaranteeing NUL-termination)
Definition: string.c:653
#define STAILQ_FOREACH(var, head, field)
Definition: queue.h:352
A List node for strings.
Definition: list.h:35
char * data
String.
Definition: list.h:36
struct ListHead head
List containing values.
Definition: slist.h:48
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_canonical_charset()

void mutt_ch_canonical_charset ( char *  buf,
size_t  buflen,
const char *  name 
)

Canonicalise the charset of a string.

Parameters
buf  Buffer for canonical character set name
buflen  Length of buffer
name  Name to be canonicalised

This first strips off any charset extension, such as "//TRANSLIT", then canonicalises the charset, and finally re-adds the extension.

Definition at line 367 of file charset.c.

368{
369 if (!buf || !name)
370 return;
371
372 char in[1024], scratch[1024 + 10];
373
374 mutt_str_copy(in, name, sizeof(in));
375 char *ext = strchr(in, '/');
376 if (ext)
377 *ext++ = '\0';
378
379 if (mutt_istr_equal(in, "utf-8") || mutt_istr_equal(in, "utf8"))
380 {
381 mutt_str_copy(buf, "utf-8", buflen);
382 goto out;
383 }
384
385 /* catch some common iso-8859-something misspellings */
386 size_t plen;
387 if ((plen = mutt_istr_startswith(in, "8859")) && (in[plen] != '-'))
388 snprintf(scratch, sizeof(scratch), "iso-8859-%s", in + plen);
389 else if ((plen = mutt_istr_startswith(in, "8859-")))
390 snprintf(scratch, sizeof(scratch), "iso-8859-%s", in + plen);
391 else if ((plen = mutt_istr_startswith(in, "iso8859")) && (in[plen] != '-'))
392 snprintf(scratch, sizeof(scratch), "iso_8859-%s", in + plen);
393 else if ((plen = mutt_istr_startswith(in, "iso8859-")))
394 snprintf(scratch, sizeof(scratch), "iso_8859-%s", in + plen);
395 else
396 mutt_str_copy(scratch, in, sizeof(scratch));
397
398 for (size_t i = 0; PreferredMimeNames[i].key; i++)
399 {
400 if (mutt_istr_equal(scratch, PreferredMimeNames[i].key))
401 {
402 mutt_str_copy(buf, PreferredMimeNames[i].pref, buflen);
403 goto out;
404 }
405 }
406
407 mutt_str_copy(buf, scratch, buflen);
408
409 /* for cosmetics' sake, transform to lowercase. */
410 for (char *p = buf; *p; p++)
411 *p = tolower(*p);
412
413out:
414 if (ext && *ext)
415 {
416 mutt_str_cat(buf, buflen, "/");
417 mutt_str_cat(buf, buflen, ext);
418 }
419}
static const struct MimeNames PreferredMimeNames[]
Lookup table of preferred charsets.
Definition: charset.c:114
bool mutt_istr_equal(const char *a, const char *b)
Compare two strings, ignoring case.
Definition: string.c:810
size_t mutt_istr_startswith(const char *str, const char *prefix)
Check whether a string starts with a prefix, ignoring case.
Definition: string.c:240
char * mutt_str_cat(char *buf, size_t buflen, const char *s)
Concatenate two strings.
Definition: string.c:266
const char * key
Definition: charset.c:100
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_chscmp()

bool mutt_ch_chscmp ( const char *  cs1,
const char *  cs2 
)

Are the names of two character sets equivalent?

Parameters
cs1  First character set
cs2  Second character set
Return values
true  Names are equivalent
false  Names differ

Charsets may have extensions that mutt_ch_canonical_charset() leaves intact. We expect 'cs2' to originate from NeoMutt code, not user input (i.e. 'cs2' does not have any extension), so we simply check whether the shorter string is a prefix of the longer one.

Definition at line 433 of file charset.c.

434{
435 if (!cs1 || !cs2)
436 return false;
437
438 char buf[256] = { 0 };
439
440 mutt_ch_canonical_charset(buf, sizeof(buf), cs1);
441
442 int len1 = mutt_str_len(buf);
443 int len2 = mutt_str_len(cs2);
444
445 return mutt_istrn_equal(((len1 > len2) ? buf : cs2),
446 ((len1 > len2) ? cs2 : buf), MIN(len1, len2));
447}
#define MIN(a, b)
Definition: memory.h:31
void mutt_ch_canonical_charset(char *buf, size_t buflen, const char *name)
Canonicalise the charset of a string.
Definition: charset.c:367
bool mutt_istrn_equal(const char *a, const char *b, size_t num)
Check for equality of two strings ignoring case (to a maximum), safely.
Definition: string.c:525
+ Here is the call graph for this function:

◆ mutt_ch_get_default_charset()

const char * mutt_ch_get_default_charset ( const struct Slist *const  assumed_charset)

Get the default character set.

Parameters
assumed_charset  From $assumed_charset
Return values
ptr  Name of the default character set
Warning
This returns a pointer to a static buffer. Do not free it.

Definition at line 456 of file charset.c.

457{
458 static char fcharset[128];
459 const char *c = NULL;
460
461 if (assumed_charset && (assumed_charset->count > 0))
462 c = STAILQ_FIRST(&assumed_charset->head)->data;
463 else
464 c = "us-ascii";
465
466 mutt_str_copy(fcharset, c, sizeof(fcharset));
467 return fcharset;
468}
#define STAILQ_FIRST(head)
Definition: queue.h:350
size_t count
Number of values in list.
Definition: slist.h:49
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_get_langinfo_charset()

char * mutt_ch_get_langinfo_charset ( void  )

Get the user's choice of character set.

Return values
ptr  Charset string

Get the canonical character set used by the user's locale. The caller must free the returned string.

Definition at line 477 of file charset.c.

478{
479 char buf[1024] = { 0 };
480
481 mutt_ch_canonical_charset(buf, sizeof(buf), nl_langinfo(CODESET));
482
483 if (buf[0] != '\0')
484 return mutt_str_dup(buf);
485
486 return mutt_str_dup("iso-8859-1");
487}
char * mutt_str_dup(const char *str)
Copy a string, safely.
Definition: string.c:251
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_lookup_add()

bool mutt_ch_lookup_add ( enum LookupType  type,
const char *  pat,
const char *  replace,
struct Buffer *  err 
)

Add a new character set lookup.

Parameters
type  Type of character set, e.g. MUTT_LOOKUP_CHARSET
pat  Pattern to match
replace  Replacement string
err  Buffer for error message
Return values
true  Lookup added to list
false  Regex string was invalid

Add a regex for a character set and a replacement name.

Definition at line 500 of file charset.c.

502{
503 if (!pat || !replace)
504 return false;
505
506 regex_t *rx = mutt_mem_calloc(1, sizeof(regex_t));
507 int rc = REG_COMP(rx, pat, REG_ICASE);
508 if (rc != 0)
509 {
510 regerror(rc, rx, err->data, err->dsize);
511 FREE(&rx);
512 return false;
513 }
514
515 struct Lookup *l = lookup_new();
516 l->type = type;
517 l->replacement = mutt_str_dup(replace);
518 l->regex.pattern = mutt_str_dup(pat);
519 l->regex.regex = rx;
520 l->regex.pat_not = false;
521
522 TAILQ_INSERT_TAIL(&Lookups, l, entries);
523
524 return true;
525}
static struct Lookup * lookup_new(void)
Create a new Lookup.
Definition: charset.c:262
#define TAILQ_INSERT_TAIL(head, elm, field)
Definition: queue.h:809
#define REG_COMP(preg, regex, cflags)
Compile a regular expression.
Definition: regex3.h:53
size_t dsize
Length of data.
Definition: buffer.h:37
char * data
Pointer to data.
Definition: buffer.h:35
bool pat_not
do not match
Definition: regex3.h:92
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_lookup_remove()

void mutt_ch_lookup_remove ( void  )

Remove all the character set lookups.

Empty the list of replacement character set names.

Definition at line 532 of file charset.c.

533{
534 struct Lookup *l = NULL;
535 struct Lookup *tmp = NULL;
536
537 TAILQ_FOREACH_SAFE(l, &Lookups, entries, tmp)
538 {
539 TAILQ_REMOVE(&Lookups, l, entries);
540 lookup_free(&l);
541 }
542}
static void lookup_free(struct Lookup **ptr)
Free a Lookup.
Definition: charset.c:271
#define TAILQ_FOREACH_SAFE(var, head, field, tvar)
Definition: queue.h:735
#define TAILQ_REMOVE(head, elm, field)
Definition: queue.h:841
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_charset_lookup()

const char * mutt_ch_charset_lookup ( const char *  chs)

Look for a replacement character set.

Parameters
chs  Character set to lookup
Return values
ptr  Replacement character set (if a 'charset-hook' matches)
NULL  No matching hook

Look through all the 'charset-hook's. If one matches return the replacement character set.

Definition at line 553 of file charset.c.

554{
556}
static const char * lookup_charset(enum LookupType type, const char *cs)
Look for a preferred character set name.
Definition: charset.c:296
@ MUTT_LOOKUP_CHARSET
Alias for another character set.
Definition: charset.h:67
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_iconv_open()

iconv_t mutt_ch_iconv_open ( const char *  tocode,
const char *  fromcode,
uint8_t  flags 
)

Set up iconv for conversions.

Parameters
tocode  Target character set
fromcode  Current character set
flags  Flags, e.g. MUTT_ICONV_HOOK_FROM
Return values
ptr  iconv handle for the conversion

Like iconv_open, but canonicalises the charsets, applies charset-hooks, recanonicalises, and finally applies iconv-hooks. Parameter flags=0 skips charset-hooks, while MUTT_ICONV_HOOK_FROM applies them to fromcode. Callers should use flags=0 when fromcode can safely be considered true, either some constant, or some value provided by the user; MUTT_ICONV_HOOK_FROM should be used only when fromcode is unsure, taken from a possibly wrong incoming MIME label, or such. Misusing MUTT_ICONV_HOOK_FROM leads to unwanted interactions in some setups.

Since calling iconv_open() repeatedly can be expensive, we keep a cache of the most recently used iconv_t objects, kept in LRU order. This means that you should not call iconv_close() on the object yourself. All remaining objects in the cache will be cleaned up when main() calls mutt_ch_cache_cleanup().

Note
By design charset-hooks should never be, and are never, applied to tocode.
Despite its name, MUTT_ICONV_HOOK_FROM acts on charset-hooks, not on iconv-hooks.

Definition at line 585 of file charset.c.

586{
587 char tocode1[128];
588 char fromcode1[128];
589 const char *tocode2 = NULL, *fromcode2 = NULL;
590 const char *tmp = NULL;
591
592 /* transform to MIME preferred charset names */
593 mutt_ch_canonical_charset(tocode1, sizeof(tocode1), tocode);
594 mutt_ch_canonical_charset(fromcode1, sizeof(fromcode1), fromcode);
595
596 /* maybe apply charset-hooks and recanonicalise fromcode,
597 * but only when caller asked us to sanitize a potentially wrong
598 * charset name incoming from the wild exterior. */
599 if (flags & MUTT_ICONV_HOOK_FROM)
600 {
601 tmp = mutt_ch_charset_lookup(fromcode1);
602 if (tmp)
603 mutt_ch_canonical_charset(fromcode1, sizeof(fromcode1), tmp);
604 }
605
606 /* check if we have this pair cached already */
607 for (int i = 0; i < IconvCacheUsed; ++i)
608 {
609 if (strcmp(tocode1, IconvCache[i].tocode1) == 0 &&
610 strcmp(fromcode1, IconvCache[i].fromcode1) == 0)
611 {
612 iconv_t cd = IconvCache[i].cd;
613
614 /* make room for this one at the top */
615 struct IconvCacheEntry top = IconvCache[i];
616 for (int j = i; j-- > 0;)
617 {
618 IconvCache[j + 1] = IconvCache[j];
619 }
620 IconvCache[0] = top;
621
622 if (iconv_t_valid(cd))
623 {
624 /* reset state */
625 iconv(cd, NULL, NULL, NULL, NULL);
626 }
627 return cd;
628 }
629 }
630
631 /* not found in cache */
632 /* always apply iconv-hooks to suit system's iconv tastes */
633 tocode2 = mutt_ch_iconv_lookup(tocode1);
634 tocode2 = tocode2 ? tocode2 : tocode1;
635 fromcode2 = mutt_ch_iconv_lookup(fromcode1);
636 fromcode2 = fromcode2 ? fromcode2 : fromcode1;
637
638 /* call system iconv with names it appreciates */
639 iconv_t cd = iconv_open(tocode2, fromcode2);
640
642 {
643 mutt_debug(LL_DEBUG2, "iconv: dropping %s -> %s from the cache\n",
646 /* get rid of the oldest entry */
650 {
651 iconv_close(IconvCache[IconvCacheUsed - 1].cd);
652 }
654 }
655
656 /* make room for this one at the top */
657 for (int j = IconvCacheUsed; j-- > 0;)
658 {
659 IconvCache[j + 1] = IconvCache[j];
660 }
661
663
664 mutt_debug(LL_DEBUG2, "iconv: adding %s -> %s to the cache\n", fromcode1, tocode1);
665 IconvCache[0].fromcode1 = strdup(fromcode1);
666 IconvCache[0].tocode1 = strdup(tocode1);
667 IconvCache[0].cd = cd;
668
669 return cd;
670}
#define mutt_debug(LEVEL,...)
Definition: logging2.h:87
@ LL_DEBUG2
Log at debug level 2.
Definition: logging2.h:44
static int IconvCacheUsed
Number of iconv descriptors in the cache.
Definition: charset.c:93
const char * mutt_ch_iconv_lookup(const char *chs)
Look for a replacement character set.
Definition: charset.c:772
const char * mutt_ch_charset_lookup(const char *chs)
Look for a replacement character set.
Definition: charset.c:553
#define ICONV_CACHE_SIZE
Max size of the iconv cache.
Definition: charset.c:89
static struct IconvCacheEntry IconvCache[ICONV_CACHE_SIZE]
Cache of iconv conversion descriptors.
Definition: charset.c:91
static bool iconv_t_valid(const iconv_t cd)
Is the conversion descriptor valid?
Definition: charset.h:112
Cached iconv conversion descriptor.
Definition: charset.c:82
char * tocode1
Destination character set.
Definition: charset.c:84
char * fromcode1
Source character set.
Definition: charset.c:83
iconv_t cd
iconv conversion descriptor
Definition: charset.c:85
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_iconv()

size_t mutt_ch_iconv ( iconv_t  cd,
const char **  inbuf,
size_t *  inbytesleft,
char **  outbuf,
size_t *  outbytesleft,
const char **  inrepls,
const char *  outrepl,
int *  iconverrno 
)

Change the encoding of a string.

Parameters
[in]  cd  Iconv conversion descriptor
[in,out]  inbuf  Buffer to convert
[in,out]  inbytesleft  Length of buffer to convert
[in,out]  outbuf  Buffer for the result
[in,out]  outbytesleft  Length of result buffer
[in]  inrepls  Input replacement characters
[in]  outrepl  Output replacement characters
[out]  iconverrno  Errno if iconv() fails, 0 if it succeeds
Return values
num  Characters converted

Like iconv, but keeps going even when the input is invalid. If you're supplying inrepls, the source charset should be stateless; if you're supplying an outrepl, the target charset should be.

Definition at line 688 of file charset.c.

691{
692 size_t rc = 0;
693 const char *ib = *inbuf;
694 size_t ibl = *inbytesleft;
695 char *ob = *outbuf;
696 size_t obl = *outbytesleft;
697
698 while (true)
699 {
700 errno = 0;
701 const size_t ret1 = iconv(cd, (ICONV_CONST char **) &ib, &ibl, &ob, &obl);
702 if (ret1 != ICONV_ILLEGAL_SEQ)
703 rc += ret1;
704 if (iconverrno)
705 *iconverrno = errno;
706
707 if (ibl && obl && (errno == EILSEQ))
708 {
709 if (inrepls)
710 {
711 /* Try replacing the input */
712 const char **t = NULL;
713 for (t = inrepls; *t; t++)
714 {
715 const char *ib1 = *t;
716 size_t ibl1 = strlen(*t);
717 char *ob1 = ob;
718 size_t obl1 = obl;
719 iconv(cd, (ICONV_CONST char **) &ib1, &ibl1, &ob1, &obl1);
720 if (ibl1 == 0)
721 {
722 ib++;
723 ibl--;
724 ob = ob1;
725 obl = obl1;
726 rc++;
727 break;
728 }
729 }
730 if (*t)
731 continue;
732 }
733 /* Replace the output */
734 if (!outrepl)
735 outrepl = "?";
736 iconv(cd, NULL, NULL, &ob, &obl);
737 if (obl)
738 {
739 int n = strlen(outrepl);
740 if (n > obl)
741 {
742 outrepl = "?";
743 n = 1;
744 }
745 memcpy(ob, outrepl, n);
746 ib++;
747 ibl--;
748 ob += n;
749 obl -= n;
750 rc++;
751 iconv(cd, NULL, NULL, NULL, NULL); /* for good measure */
752 continue;
753 }
754 }
755 *inbuf = ib;
756 *inbytesleft = ibl;
757 *outbuf = ob;
758 *outbytesleft = obl;
759 return rc;
760 }
761}
#define EILSEQ
Definition: charset.c:48
#define ICONV_ILLEGAL_SEQ
Error value for iconv() - Illegal sequence.
Definition: charset.h:103
+ Here is the caller graph for this function:

◆ mutt_ch_iconv_lookup()

const char * mutt_ch_iconv_lookup ( const char *  chs)

Look for a replacement character set.

Parameters
chsCharacter set to lookup
Return values
ptrReplacement character set (if a 'iconv-hook' matches)
NULLNo matching hook

Look through all the 'iconv-hook's. If one matches return the replacement character set.

Definition at line 772 of file charset.c.

773{
775}
@ MUTT_LOOKUP_ICONV
Character set conversion.
Definition: charset.h:68
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_check()

int mutt_ch_check ( const char *  s,
size_t  slen,
const char *  from,
const char *  to 
)

Check whether a string can be converted between encodings.

Parameters
[in]  s  String to check
[in]  slen  Length of the string to check
[in]  from  Current character set
[in]  to  Target character set
Return values
0  Success
-1  Error in iconv_open()
>0  Errno as set by iconv()

Definition at line 787 of file charset.c.

788{
789 if (!s || !from || !to)
790 return -1;
791
792 int rc = 0;
793 iconv_t cd = mutt_ch_iconv_open(to, from, MUTT_ICONV_NO_FLAGS);
794 if (!iconv_t_valid(cd))
795 return -1;
796
797 size_t outlen = MB_LEN_MAX * slen;
798 char *out = mutt_mem_malloc(outlen + 1);
799 char *saved_out = out;
800
801 const size_t convlen = iconv(cd, (ICONV_CONST char **) &s, &slen, &out, &outlen);
802 if (convlen == ICONV_ILLEGAL_SEQ)
803 rc = errno;
804
805 FREE(&saved_out);
806 return rc;
807}
iconv_t mutt_ch_iconv_open(const char *tocode, const char *fromcode, uint8_t flags)
Set up iconv for conversions.
Definition: charset.c:585
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_convert_string()

int mutt_ch_convert_string ( char **  ps,
const char *  from,
const char *  to,
uint8_t  flags 
)

Convert a string between encodings.

Parameters
[in,out]  ps  String to convert
[in]  from  Current character set
[in]  to  Target character set
[in]  flags  Flags, e.g. MUTT_ICONV_HOOK_FROM
Return values
0  Success
-1  Invalid arguments or failure to open an iconv channel
errno  Failure in iconv conversion

Parameter flags is given as-is to mutt_ch_iconv_open(). See there for its meaning and usage policy.

Definition at line 822 of file charset.c.

823{
824 if (!ps)
825 return -1;
826
827 char *s = *ps;
828
829 if (!s || (*s == '\0'))
830 return 0;
831
832 if (!to || !from)
833 return -1;
834
835 const char *repls[] = { "\357\277\275", "?", 0 };
836 int rc = 0;
837
838 iconv_t cd = mutt_ch_iconv_open(to, from, flags);
839 if (!iconv_t_valid(cd))
840 return -1;
841
842 const char **inrepls = NULL;
843 const char *outrepl = NULL;
844
845 if (mutt_ch_is_utf8(to))
846 outrepl = "\357\277\275";
847 else if (mutt_ch_is_utf8(from))
848 inrepls = repls;
849 else
850 outrepl = "?";
851
852 const char *ib = s;
853 size_t ibl = strlen(s);
854 if (ibl >= (SIZE_MAX / MB_LEN_MAX))
855 {
856 return -1;
857 }
858 size_t obl = MB_LEN_MAX * ibl;
859 char *buf = mutt_mem_malloc(obl + 1);
860 char *ob = buf;
861
862 mutt_ch_iconv(cd, &ib, &ibl, &ob, &obl, inrepls, outrepl, &rc);
863 iconv(cd, 0, 0, &ob, &obl);
864
865 *ob = '\0';
866
867 FREE(ps);
868 *ps = buf;
869
870 mutt_str_adjust(ps);
871 return rc;
872}
size_t mutt_ch_iconv(iconv_t cd, const char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft, const char **inrepls, const char *outrepl, int *iconverrno)
Change the encoding of a string.
Definition: charset.c:688
#define mutt_ch_is_utf8(str)
Definition: charset.h:96
void mutt_str_adjust(char **ptr)
Shrink-to-fit a string.
Definition: string.c:371
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_check_charset()

bool mutt_ch_check_charset ( const char *  cs,
bool  strict 
)

Does iconv understand a character set?

Parameters
cs  Character set to check
strict  Check strictly by using iconv
Return values
true  Character set is valid

If strict is false, then finding a matching character set in PreferredMimeNames will be enough. If strict is true, or the charset is not in PreferredMimeNames, then iconv() will be run.

Definition at line 885 of file charset.c.

886{
887 if (!cs)
888 return false;
889
890 if (mutt_ch_is_utf8(cs))
891 return true;
892
893 if (!strict)
894 {
895 for (int i = 0; PreferredMimeNames[i].key; i++)
896 {
897 if (mutt_istr_equal(PreferredMimeNames[i].key, cs) ||
899 {
900 return true;
901 }
902 }
903 }
904
905 iconv_t cd = mutt_ch_iconv_open(cs, cs, MUTT_ICONV_NO_FLAGS);
906 if (iconv_t_valid(cd))
907 {
908 return true;
909 }
910
911 return false;
912}
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_fgetconv_open()

struct FgetConv * mutt_ch_fgetconv_open ( FILE *  fp,
const char *  from,
const char *  to,
uint8_t  flags 
)

Prepare a file for charset conversion.

Parameters
fp  FILE ptr to prepare
from  Current character set
to  Destination character set
flags  Flags, e.g. MUTT_ICONV_HOOK_FROM
Return values
ptr  fgetconv handle

Parameter flags is given as-is to mutt_ch_iconv_open().

Definition at line 924 of file charset.c.

925{
926 struct FgetConv *fc = NULL;
927 iconv_t cd = ICONV_T_INVALID;
928
929 if (from && to)
930 cd = mutt_ch_iconv_open(to, from, flags);
931
932 if (iconv_t_valid(cd))
933 {
934 static const char *repls[] = { "\357\277\275", "?", 0 };
935
936 fc = mutt_mem_malloc(sizeof(struct FgetConv));
937 fc->p = fc->bufo;
938 fc->ob = fc->bufo;
939 fc->ib = fc->bufi;
940 fc->ibl = 0;
941 fc->inrepls = mutt_ch_is_utf8(to) ? repls : repls + 1;
942 }
943 else
944 {
945 fc = mutt_mem_malloc(sizeof(struct FgetConvNot));
946 }
947 fc->fp = fp;
948 fc->cd = cd;
949 return fc;
950}
#define ICONV_T_INVALID
Error value for iconv functions.
Definition: charset.h:100
A dummy converter.
Definition: charset.h:57
Cursor for converting a file's encoding.
Definition: charset.h:41
char bufi[512]
Definition: charset.h:44
iconv_t cd
iconv conversion descriptor
Definition: charset.h:43
char bufo[512]
Definition: charset.h:45
size_t ibl
Definition: charset.h:49
FILE * fp
Definition: charset.h:42
char * p
Definition: charset.h:46
const char ** inrepls
Definition: charset.h:50
char * ib
Definition: charset.h:48
char * ob
Definition: charset.h:47
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_fgetconv_close()

void mutt_ch_fgetconv_close ( struct FgetConv **  fc)

Close an fgetconv handle.

Parameters
[out]  fc  fgetconv handle

Definition at line 956 of file charset.c.

957{
958 if (!fc || !*fc)
959 return;
960
961 FREE(fc);
962}
+ Here is the caller graph for this function:

◆ mutt_ch_fgetconv()

int mutt_ch_fgetconv ( struct FgetConv *  fc)

Convert a file's character set.

Parameters
fc: FgetConv handle
Return values
num: Next character in the converted file
EOF: Error

A file is read into a buffer and its character set is converted. Each call to this function will return one converted character. The buffer is refilled automatically when empty.

Definition at line 974 of file charset.c.

975{
976 if (!fc)
977 return EOF;
978 if (!iconv_t_valid(fc->cd))
979 return fgetc(fc->fp);
980 if (!fc->p)
981 return EOF;
982 if (fc->p < fc->ob)
983 return (unsigned char) *(fc->p)++;
984
985 /* Try to convert some more */
986 fc->p = fc->bufo;
987 fc->ob = fc->bufo;
988 if (fc->ibl)
989 {
990 size_t obl = sizeof(fc->bufo);
991 iconv(fc->cd, (ICONV_CONST char **) &fc->ib, &fc->ibl, &fc->ob, &obl);
992 if (fc->p < fc->ob)
993 return (unsigned char) *(fc->p)++;
994 }
995
996 /* If we trusted iconv a bit more, we would at this point
997 * ask why it had stopped converting ... */
998
999 /* Try to read some more */
1000 if ((fc->ibl == sizeof(fc->bufi)) ||
1001 (fc->ibl && (fc->ib + fc->ibl < fc->bufi + sizeof(fc->bufi))))
1002 {
1003 fc->p = 0;
1004 return EOF;
1005 }
1006 if (fc->ibl)
1007 memcpy(fc->bufi, fc->ib, fc->ibl);
1008 fc->ib = fc->bufi;
1009 fc->ibl += fread(fc->ib + fc->ibl, 1, sizeof(fc->bufi) - fc->ibl, fc->fp);
1010
1011 /* Try harder this time to convert some */
1012 if (fc->ibl)
1013 {
1014 size_t obl = sizeof(fc->bufo);
1015 mutt_ch_iconv(fc->cd, (const char **) &fc->ib, &fc->ibl, &fc->ob, &obl,
1016 fc->inrepls, 0, NULL);
1017 if (fc->p < fc->ob)
1018 return (unsigned char) *(fc->p)++;
1019 }
1020
1021 /* Either the file has finished or one of the buffers is too small */
1022 fc->p = 0;
1023 return EOF;
1024}
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_fgetconvs()

char * mutt_ch_fgetconvs ( char *  buf,
size_t  buflen,
struct FgetConv *  fc 
)

Convert a file's charset into a string buffer.

Parameters
buf: Buffer for result
buflen: Length of buffer
fc: FgetConv handle
Return values
ptr: Success, result buffer
NULL: Error

Read a file into a buffer, converting the character set as it goes.

Definition at line 1036 of file charset.c.

/**
 * mutt_ch_fgetconvs - Convert a file's charset into a string buffer
 * @param buf    Buffer for result
 * @param buflen Length of buffer
 * @param fc     FgetConv handle
 * @retval ptr  Success, result buffer
 * @retval NULL Error
 *
 * Read a file into a buffer, converting the character set as it goes.
 *
 * Reading stops after a newline (which is stored), at EOF, or once only the
 * space for the terminating NUL is left.  NULL is returned when nothing at
 * all could be read, or when there is no usable buffer.
 */
char *mutt_ch_fgetconvs(char *buf, size_t buflen, struct FgetConv *fc)
{
  /* A zero-length buffer has no room even for the terminating NUL */
  if (!buf || (buflen == 0))
    return NULL;

  size_t r;
  for (r = 0; (r + 1) < buflen;)
  {
    const int c = mutt_ch_fgetconv(fc);
    if (c == EOF)
      break;
    buf[r++] = (char) c;
    if (c == '\n')
      break;
  }
  buf[r] = '\0';

  if (r > 0)
    return buf;

  return NULL;
}
int mutt_ch_fgetconv(struct FgetConv *fc)
Convert a file's character set.
Definition: charset.c:974
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_set_charset()

void mutt_ch_set_charset ( const char *  charset)

Update the records for a new character set.

Parameters
charset: New character set

Check if this character set is utf-8 and pick a suitable replacement character for unprintable characters.

Note
This calls bind_textdomain_codeset() which will affect future message translations.

Definition at line 1069 of file charset.c.

1070{
1071 char buf[256] = { 0 };
1072
1073 mutt_ch_canonical_charset(buf, sizeof(buf), charset);
1074
1075 if (mutt_ch_is_utf8(buf))
1076 {
1077 CharsetIsUtf8 = true;
1078 ReplacementChar = 0xfffd; /* replacement character */
1079 }
1080 else
1081 {
1082 CharsetIsUtf8 = false;
1083 ReplacementChar = '?';
1084 }
1085
1086#if defined(HAVE_BIND_TEXTDOMAIN_CODESET) && defined(ENABLE_NLS)
1087 bind_textdomain_codeset(PACKAGE, buf);
1088#endif
1089}
bool CharsetIsUtf8
Is the user's current character set utf-8?
Definition: charset.c:59
wchar_t ReplacementChar
When a Unicode character can't be displayed, use this instead.
Definition: charset.c:54
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_choose()

char * mutt_ch_choose ( const char *  fromcode,
const struct Slist *  charsets,
const char *  u,
size_t  ulen,
char **  d,
size_t *  dlen 
)

Figure the best charset to encode a string.

Parameters
[in] fromcode: Original charset of the string
[in] charsets: List of potential charsets to use
[in] u: String to encode
[in] ulen: Length of the string to encode
[out] d: If not NULL, point it to the converted string
[out] dlen: If not NULL, point it to the length of the d string
Return values
ptr: Best performing charset
NULL: None could be found

Definition at line 1102 of file charset.c.

1104{
1105 if (!fromcode || !charsets)
1106 return NULL;
1107
1108 char *e = NULL, *tocode = NULL;
1109 size_t elen = 0, bestn = 0;
1110
1111 const struct ListNode *np = NULL;
1112 STAILQ_FOREACH(np, &charsets->head, entries)
1113 {
1114 char *t = mutt_str_dup(np->data);
1115 if (!t)
1116 continue;
1117
1118 size_t n = mutt_str_len(t);
1119 char *s = mutt_strn_dup(u, ulen);
1120 const int rc = d ? mutt_ch_convert_string(&s, fromcode, t, MUTT_ICONV_NO_FLAGS) :
1121 mutt_ch_check(s, ulen, fromcode, t);
1122 if (rc)
1123 {
1124 FREE(&t);
1125 FREE(&s);
1126 continue;
1127 }
1128 size_t slen = mutt_str_len(s);
1129
1130 if (!tocode || (n < bestn))
1131 {
1132 bestn = n;
1133 FREE(&tocode);
1134 tocode = t;
1135 if (d)
1136 {
1137 FREE(&e);
1138 e = s;
1139 }
1140 else
1141 {
1142 FREE(&s);
1143 }
1144 elen = slen;
1145 }
1146 else
1147 {
1148 FREE(&t);
1149 FREE(&s);
1150 }
1151 }
1152 if (tocode)
1153 {
1154 if (d)
1155 *d = e;
1156 if (dlen)
1157 *dlen = elen;
1158
1159 char canonical_buf[1024] = { 0 };
1160 mutt_ch_canonical_charset(canonical_buf, sizeof(canonical_buf), tocode);
1161 mutt_str_replace(&tocode, canonical_buf);
1162 }
1163 return tocode;
1164}
int mutt_ch_check(const char *s, size_t slen, const char *from, const char *to)
Check whether a string can be converted between encodings.
Definition: charset.c:787
char * mutt_str_replace(char **p, const char *s)
Replace one string with another.
Definition: string.c:327
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_cache_cleanup()

void mutt_ch_cache_cleanup ( void  )

Clean up the cached iconv handles and charset strings.

Definition at line 1169 of file charset.c.

1170{
1171 for (int i = 0; i < IconvCacheUsed; ++i)
1172 {
1173 FREE(&IconvCache[i].fromcode1);
1174 FREE(&IconvCache[i].tocode1);
1175 if (iconv_t_valid(IconvCache[i].cd))
1176 {
1177 iconv_close(IconvCache[i].cd);
1178 }
1179 }
1180 IconvCacheUsed = 0;
1181}
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

Variable Documentation

◆ ReplacementChar

wchar_t ReplacementChar = '?'

When a Unicode character can't be displayed, use this instead.

Definition at line 54 of file charset.c.

◆ CharsetIsUtf8

bool CharsetIsUtf8 = false

Is the user's current character set utf-8?

Definition at line 59 of file charset.c.

◆ Lookups

struct LookupList Lookups = TAILQ_HEAD_INITIALIZER(Lookups)
static

Lookup table of preferred character set names.

Definition at line 76 of file charset.c.

◆ IconvCache

struct IconvCacheEntry IconvCache[ICONV_CACHE_SIZE]
static

Cache of iconv conversion descriptors.

Definition at line 91 of file charset.c.

◆ IconvCacheUsed

int IconvCacheUsed = 0
static

Number of iconv descriptors in the cache.

Definition at line 93 of file charset.c.

◆ PreferredMimeNames

const struct MimeNames PreferredMimeNames[]
static

Lookup table of preferred charsets.

The following list has been created manually from the data under: http://www.isi.edu/in-notes/iana/assignments/character-sets Last update: 2000-09-07

Note
It includes only the subset of character sets for which a preferred MIME name is given.

Definition at line 114 of file charset.c.