NeoMutt  2022-04-29-249-gaae397
Teaching an old dog new tricks
DOXYGEN
charset.c File Reference

Conversion between different character encodings. More...

#include "config.h"
#include <ctype.h>
#include <errno.h>
#include <iconv.h>
#include <langinfo.h>
#include <limits.h>
#include <stdbool.h>
#include <stdio.h>
#include <string.h>
#include "config/lib.h"
#include "core/lib.h"
#include "charset.h"
#include "lib.h"
#include "memory.h"
#include "queue.h"
#include "regex3.h"
#include "string2.h"
#include <libintl.h>
+ Include dependency graph for charset.c:

Go to the source code of this file.

Data Structures

struct  Lookup
 Regex to String lookup table. More...
 
struct  MimeNames
 MIME name lookup entry. More...
 

Macros

#define EILSEQ   EINVAL
 

Functions

 TAILQ_HEAD (LookupList, Lookup)
 
static struct Lookup * lookup_new (void)
 Create a new Lookup. More...
 
static void lookup_free (struct Lookup **ptr)
 Free a Lookup. More...
 
static const char * lookup_charset (enum LookupType type, const char *cs)
 Look for a preferred character set name. More...
 
int mutt_ch_convert_nonmime_string (char **ps)
 Try to convert a string using a list of character sets. More...
 
void mutt_ch_canonical_charset (char *buf, size_t buflen, const char *name)
 Canonicalise the charset of a string. More...
 
bool mutt_ch_chscmp (const char *cs1, const char *cs2)
 Are the names of two character sets equivalent? More...
 
char * mutt_ch_get_default_charset (void)
 Get the default character set. More...
 
char * mutt_ch_get_langinfo_charset (void)
 Get the user's choice of character set. More...
 
bool mutt_ch_lookup_add (enum LookupType type, const char *pat, const char *replace, struct Buffer *err)
 Add a new character set lookup. More...
 
void mutt_ch_lookup_remove (void)
 Remove all the character set lookups. More...
 
const char * mutt_ch_charset_lookup (const char *chs)
 Look for a replacement character set. More...
 
iconv_t mutt_ch_iconv_open (const char *tocode, const char *fromcode, uint8_t flags)
 Set up iconv for conversions. More...
 
size_t mutt_ch_iconv (iconv_t cd, const char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft, const char **inrepls, const char *outrepl, int *iconverrno)
 Change the encoding of a string. More...
 
const char * mutt_ch_iconv_lookup (const char *chs)
 Look for a replacement character set. More...
 
int mutt_ch_check (const char *s, size_t slen, const char *from, const char *to)
 Check whether a string can be converted between encodings. More...
 
int mutt_ch_convert_string (char **ps, const char *from, const char *to, uint8_t flags)
 Convert a string between encodings. More...
 
bool mutt_ch_check_charset (const char *cs, bool strict)
 Does iconv understand a character set? More...
 
struct FgetConv * mutt_ch_fgetconv_open (FILE *fp, const char *from, const char *to, uint8_t flags)
 Prepare a file for charset conversion. More...
 
void mutt_ch_fgetconv_close (struct FgetConv **fc)
 Close an fgetconv handle. More...
 
int mutt_ch_fgetconv (struct FgetConv *fc)
 Convert a file's character set. More...
 
char * mutt_ch_fgetconvs (char *buf, size_t buflen, struct FgetConv *fc)
 Convert a file's charset into a string buffer. More...
 
void mutt_ch_set_charset (const char *charset)
 Update the records for a new character set. More...
 
char * mutt_ch_choose (const char *fromcode, const struct Slist *charsets, const char *u, size_t ulen, char **d, size_t *dlen)
 Figure the best charset to encode a string. More...
 

Variables

wchar_t ReplacementChar = '?'
 When a Unicode character can't be displayed, use this instead. More...
 
bool CharsetIsUtf8 = false
 Is the user's current character set utf-8? More...
 
static struct LookupList Lookups = TAILQ_HEAD_INITIALIZER(Lookups)
 
const struct MimeNames PreferredMimeNames []
 Lookup table of preferred charsets. More...
 

Detailed Description

Conversion between different character encodings.

Authors
  • Thomas Roessler

This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with this program. If not, see http://www.gnu.org/licenses/.

Definition in file charset.c.

Macro Definition Documentation

◆ EILSEQ

#define EILSEQ   EINVAL

Definition at line 51 of file charset.c.

Function Documentation

◆ TAILQ_HEAD()

TAILQ_HEAD ( LookupList  ,
Lookup   
)

◆ lookup_new()

static struct Lookup * lookup_new ( void  )
static

Create a new Lookup.

Return values
ptr: New Lookup

Definition at line 247 of file charset.c.

248{
249 return mutt_mem_calloc(1, sizeof(struct Lookup));
250}
void * mutt_mem_calloc(size_t nmemb, size_t size)
Allocate zeroed memory on the heap.
Definition: memory.c:50
Regex to String lookup table.
Definition: charset.c:70
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ lookup_free()

static void lookup_free ( struct Lookup **  ptr)
static

Free a Lookup.

Parameters
ptr: Lookup to free

Definition at line 256 of file charset.c.

257{
258 if (!ptr || !*ptr)
259 return;
260
261 struct Lookup *l = *ptr;
262 FREE(&l->replacement);
263 FREE(&l->regex.pattern);
264 if (l->regex.regex)
265 regfree(l->regex.regex);
266 FREE(&l->regex.regex);
267 FREE(&l->regex);
268
269 FREE(ptr);
270}
#define FREE(x)
Definition: memory.h:43
char * replacement
Alternative charset to use.
Definition: charset.c:73
struct Regex regex
Regular expression.
Definition: charset.c:72
char * pattern
printable version
Definition: regex3.h:90
regex_t * regex
compiled expression
Definition: regex3.h:91
+ Here is the caller graph for this function:

◆ lookup_charset()

static const char * lookup_charset ( enum LookupType  type,
const char *  cs 
)
static

Look for a preferred character set name.

Parameters
type: Type, e.g. MUTT_LOOKUP_CHARSET
cs: Character set
Return values
ptr: Charset string

If the character set matches one of the regexes, then return the replacement name.

Definition at line 281 of file charset.c.

282{
283 if (!cs)
284 return NULL;
285
286 struct Lookup *l = NULL;
287
288 TAILQ_FOREACH(l, &Lookups, entries)
289 {
290 if (l->type != type)
291 continue;
292 if (mutt_regex_match(&l->regex, cs))
293 return l->replacement;
294 }
295 return NULL;
296}
static struct LookupList Lookups
Definition: charset.c:78
bool mutt_regex_match(const struct Regex *regex, const char *str)
Shorthand to mutt_regex_capture()
Definition: regex.c:631
#define TAILQ_FOREACH(var, head, field)
Definition: queue.h:725
enum LookupType type
Lookup type.
Definition: charset.c:71
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_convert_nonmime_string()

int mutt_ch_convert_nonmime_string ( char **  ps)

Try to convert a string using a list of character sets.

Parameters
[in,out] ps: String to be converted
Return values
0: Success
-1: Error

Work through $assumed_charset looking for a character set conversion that works. Failing that, try mutt_ch_get_default_charset().

Definition at line 307 of file charset.c.

308{
309 if (!ps)
310 return -1;
311
312 char *u = *ps;
313 const size_t ulen = mutt_str_len(u);
314 if (ulen == 0)
315 return 0;
316
317 const struct Slist *const c_assumed_charset = cs_subset_slist(NeoMutt->sub, "assumed_charset");
318 const char *const c_charset = cs_subset_string(NeoMutt->sub, "charset");
319 const struct ListNode *np = NULL;
320 STAILQ_FOREACH(np, &c_assumed_charset->head, entries)
321 {
322 char const *c = np->data;
323 size_t n = mutt_str_len(c);
324 char *fromcode = mutt_mem_malloc(n + 1);
325 mutt_str_copy(fromcode, c, n + 1);
326 char *s = mutt_strn_dup(u, ulen);
327 int m = mutt_ch_convert_string(&s, fromcode, c_charset, MUTT_ICONV_NO_FLAGS);
328 FREE(&fromcode);
329 if (m == 0)
330 {
331 FREE(ps);
332 *ps = s;
333 return 0;
334 }
335 FREE(&s);
336 }
338 c_charset, MUTT_ICONV_HOOK_FROM);
339 return -1;
340}
const char * cs_subset_string(const struct ConfigSubset *sub, const char *name)
Get a string config item by name.
Definition: helpers.c:317
const struct Slist * cs_subset_slist(const struct ConfigSubset *sub, const char *name)
Get a string-list config item by name.
Definition: helpers.c:268
void * mutt_mem_malloc(size_t size)
Allocate memory on the heap.
Definition: memory.c:90
int mutt_ch_convert_string(char **ps, const char *from, const char *to, uint8_t flags)
Convert a string between encodings.
Definition: charset.c:752
char * mutt_ch_get_default_charset(void)
Get the default character set.
Definition: charset.c:439
#define MUTT_ICONV_HOOK_FROM
apply charset-hooks to fromcode
Definition: charset.h:72
#define MUTT_ICONV_NO_FLAGS
No flags are set.
Definition: charset.h:71
char * mutt_strn_dup(const char *begin, size_t len)
Duplicate a sub-string.
Definition: string.c:451
size_t mutt_str_len(const char *a)
Calculate the length of a string, safely.
Definition: string.c:567
size_t mutt_str_copy(char *dest, const char *src, size_t dsize)
Copy a string into a buffer (guaranteeing NUL-termination)
Definition: string.c:652
#define STAILQ_FOREACH(var, head, field)
Definition: queue.h:352
A List node for strings.
Definition: list.h:35
char * data
String.
Definition: list.h:36
Container for Accounts, Notifications.
Definition: neomutt.h:37
struct ConfigSubset * sub
Inherited config items.
Definition: neomutt.h:39
String list.
Definition: slist.h:47
struct ListHead head
List containing values.
Definition: slist.h:48
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_canonical_charset()

void mutt_ch_canonical_charset ( char *  buf,
size_t  buflen,
const char *  name 
)

Canonicalise the charset of a string.

Parameters
buf: Buffer for canonical character set name
buflen: Length of buffer
name: Name to be canonicalised

This first strips off any charset extension such as "//TRANSLIT", canonicalises the charset, and then re-adds the extension.

Definition at line 351 of file charset.c.

352{
353 if (!buf || !name)
354 return;
355
356 char in[1024], scratch[1024 + 10];
357
358 mutt_str_copy(in, name, sizeof(in));
359 char *ext = strchr(in, '/');
360 if (ext)
361 *ext++ = '\0';
362
363 if (mutt_istr_equal(in, "utf-8") || mutt_istr_equal(in, "utf8"))
364 {
365 mutt_str_copy(buf, "utf-8", buflen);
366 goto out;
367 }
368
369 /* catch some common iso-8859-something misspellings */
370 size_t plen;
371 if ((plen = mutt_istr_startswith(in, "8859")) && (in[plen] != '-'))
372 snprintf(scratch, sizeof(scratch), "iso-8859-%s", in + plen);
373 else if ((plen = mutt_istr_startswith(in, "8859-")))
374 snprintf(scratch, sizeof(scratch), "iso-8859-%s", in + plen);
375 else if ((plen = mutt_istr_startswith(in, "iso8859")) && (in[plen] != '-'))
376 snprintf(scratch, sizeof(scratch), "iso_8859-%s", in + plen);
377 else if ((plen = mutt_istr_startswith(in, "iso8859-")))
378 snprintf(scratch, sizeof(scratch), "iso_8859-%s", in + plen);
379 else
380 mutt_str_copy(scratch, in, sizeof(scratch));
381
382 for (size_t i = 0; PreferredMimeNames[i].key; i++)
383 {
384 if (mutt_istr_equal(scratch, PreferredMimeNames[i].key))
385 {
386 mutt_str_copy(buf, PreferredMimeNames[i].pref, buflen);
387 goto out;
388 }
389 }
390
391 mutt_str_copy(buf, scratch, buflen);
392
393 /* for cosmetics' sake, transform to lowercase. */
394 for (char *p = buf; *p; p++)
395 *p = tolower(*p);
396
397out:
398 if (ext && *ext)
399 {
400 mutt_str_cat(buf, buflen, "/");
401 mutt_str_cat(buf, buflen, ext);
402 }
403}
const struct MimeNames PreferredMimeNames[]
Lookup table of preferred charsets.
Definition: charset.c:99
bool mutt_istr_equal(const char *a, const char *b)
Compare two strings, ignoring case.
Definition: string.c:819
size_t mutt_istr_startswith(const char *str, const char *prefix)
Check whether a string starts with a prefix, ignoring case.
Definition: string.c:239
char * mutt_str_cat(char *buf, size_t buflen, const char *s)
Concatenate two strings.
Definition: string.c:265
static size_t plen
Length of cached packet.
Definition: pgppacket.c:39
const char * key
Definition: charset.c:85
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_chscmp()

bool mutt_ch_chscmp ( const char *  cs1,
const char *  cs2 
)

Are the names of two character sets equivalent?

Parameters
cs1: First character set
cs2: Second character set
Return values
true: Names are equivalent
false: Names differ

Charsets may have extensions that mutt_ch_canonical_charset() leaves intact. We expect 'cs2' to originate from neomutt code, not user input (i.e. 'cs2' does not have any extension), so we simply check whether the shorter string is a prefix of the longer.

Definition at line 417 of file charset.c.

418{
419 if (!cs1 || !cs2)
420 return false;
421
422 char buf[256] = { 0 };
423
424 mutt_ch_canonical_charset(buf, sizeof(buf), cs1);
425
426 int len1 = mutt_str_len(buf);
427 int len2 = mutt_str_len(cs2);
428
429 return mutt_istrn_equal(((len1 > len2) ? buf : cs2),
430 ((len1 > len2) ? cs2 : buf), MIN(len1, len2));
431}
#define MIN(a, b)
Definition: memory.h:31
void mutt_ch_canonical_charset(char *buf, size_t buflen, const char *name)
Canonicalise the charset of a string.
Definition: charset.c:351
bool mutt_istrn_equal(const char *a, const char *b, size_t num)
Check for equality of two strings ignoring case (to a maximum), safely.
Definition: string.c:524
+ Here is the call graph for this function:

◆ mutt_ch_get_default_charset()

char * mutt_ch_get_default_charset ( void  )

Get the default character set.

Return values
ptr: Name of the default character set
Warning
This returns a pointer to a static buffer. Do not free it.

Definition at line 439 of file charset.c.

440{
441 static char fcharset[128];
442 const char *c = NULL;
443 const struct Slist *const c_assumed_charset = cs_subset_slist(NeoMutt->sub, "assumed_charset");
444
445 if (c_assumed_charset && (c_assumed_charset->count > 0))
446 c = STAILQ_FIRST(&c_assumed_charset->head)->data;
447 else
448 c = "us-ascii";
449
450 mutt_str_copy(fcharset, c, sizeof(fcharset));
451 return fcharset;
452}
#define STAILQ_FIRST(head)
Definition: queue.h:350
size_t count
Number of values in list.
Definition: slist.h:49
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_get_langinfo_charset()

char * mutt_ch_get_langinfo_charset ( void  )

Get the user's choice of character set.

Return values
ptr: Charset string

Get the canonical character set used by the user's locale. The caller must free the returned string.

Definition at line 461 of file charset.c.

462{
463 char buf[1024] = { 0 };
464
465 mutt_ch_canonical_charset(buf, sizeof(buf), nl_langinfo(CODESET));
466
467 if (buf[0] != '\0')
468 return mutt_str_dup(buf);
469
470 return mutt_str_dup("iso-8859-1");
471}
char * mutt_str_dup(const char *str)
Copy a string, safely.
Definition: string.c:250
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_lookup_add()

bool mutt_ch_lookup_add ( enum LookupType  type,
const char *  pat,
const char *  replace,
struct Buffer *  err 
)

Add a new character set lookup.

Parameters
type: Type of character set, e.g. MUTT_LOOKUP_CHARSET
pat: Pattern to match
replace: Replacement string
err: Buffer for error message
Return values
true: Lookup added to list
false: Regex string was invalid

Add a regex for a character set and a replacement name.

Definition at line 484 of file charset.c.

486{
487 if (!pat || !replace)
488 return false;
489
490 regex_t *rx = mutt_mem_calloc(1, sizeof(regex_t));
491 int rc = REG_COMP(rx, pat, REG_ICASE);
492 if (rc != 0)
493 {
494 regerror(rc, rx, err->data, err->dsize);
495 FREE(&rx);
496 return false;
497 }
498
499 struct Lookup *l = lookup_new();
500 l->type = type;
501 l->replacement = mutt_str_dup(replace);
502 l->regex.pattern = mutt_str_dup(pat);
503 l->regex.regex = rx;
504 l->regex.pat_not = false;
505
506 TAILQ_INSERT_TAIL(&Lookups, l, entries);
507
508 return true;
509}
static struct Lookup * lookup_new(void)
Create a new Lookup.
Definition: charset.c:247
#define TAILQ_INSERT_TAIL(head, elm, field)
Definition: queue.h:809
#define REG_COMP(preg, regex, cflags)
Compile a regular expression.
Definition: regex3.h:53
size_t dsize
Length of data.
Definition: buffer.h:37
char * data
Pointer to data.
Definition: buffer.h:35
bool pat_not
do not match
Definition: regex3.h:92
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_lookup_remove()

void mutt_ch_lookup_remove ( void  )

Remove all the character set lookups.

Empty the list of replacement character set names.

Definition at line 516 of file charset.c.

517{
518 struct Lookup *l = NULL;
519 struct Lookup *tmp = NULL;
520
521 TAILQ_FOREACH_SAFE(l, &Lookups, entries, tmp)
522 {
523 TAILQ_REMOVE(&Lookups, l, entries);
524 lookup_free(&l);
525 }
526}
static void lookup_free(struct Lookup **ptr)
Free a Lookup.
Definition: charset.c:256
#define TAILQ_FOREACH_SAFE(var, head, field, tvar)
Definition: queue.h:735
#define TAILQ_REMOVE(head, elm, field)
Definition: queue.h:841
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_charset_lookup()

const char * mutt_ch_charset_lookup ( const char *  chs)

Look for a replacement character set.

Parameters
chs: Character set to lookup
Return values
ptr: Replacement character set (if a 'charset-hook' matches)
NULL: No matching hook

Look through all the 'charset-hook's. If one matches return the replacement character set.

Definition at line 537 of file charset.c.

538{
540}
static char * chs
Definition: gnupgparse.c:73
static const char * lookup_charset(enum LookupType type, const char *cs)
Look for a preferred character set name.
Definition: charset.c:281
@ MUTT_LOOKUP_CHARSET
Alias for another character set.
Definition: charset.h:67
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_iconv_open()

iconv_t mutt_ch_iconv_open ( const char *  tocode,
const char *  fromcode,
uint8_t  flags 
)

Set up iconv for conversions.

Parameters
tocode: Target character set
fromcode: Current character set
flags: Flags, e.g. MUTT_ICONV_HOOK_FROM
Return values
ptr: iconv handle for the conversion

Like iconv_open, but canonicalises the charsets, applies charset-hooks, recanonicalises, and finally applies iconv-hooks. Parameter flags=0 skips charset-hooks, while MUTT_ICONV_HOOK_FROM applies them to fromcode. Callers should use flags=0 when fromcode can safely be considered true, either some constant, or some value provided by the user; MUTT_ICONV_HOOK_FROM should be used only when fromcode is unsure, taken from a possibly wrong incoming MIME label, or such. Misusing MUTT_ICONV_HOOK_FROM leads to unwanted interactions in some setups.

Note
By design charset-hooks should never be, and are never, applied to tocode.
Despite its name, MUTT_ICONV_HOOK_FROM acts on charset-hooks, not at all on iconv-hooks.

Definition at line 564 of file charset.c.

565{
566 char tocode1[128];
567 char fromcode1[128];
568 const char *tocode2 = NULL, *fromcode2 = NULL;
569 const char *tmp = NULL;
570
571 iconv_t cd;
572
573 /* transform to MIME preferred charset names */
574 mutt_ch_canonical_charset(tocode1, sizeof(tocode1), tocode);
575 mutt_ch_canonical_charset(fromcode1, sizeof(fromcode1), fromcode);
576
577 /* maybe apply charset-hooks and recanonicalise fromcode,
578 * but only when caller asked us to sanitize a potentially wrong
579 * charset name incoming from the wild exterior. */
580 if (flags & MUTT_ICONV_HOOK_FROM)
581 {
582 tmp = mutt_ch_charset_lookup(fromcode1);
583 if (tmp)
584 mutt_ch_canonical_charset(fromcode1, sizeof(fromcode1), tmp);
585 }
586
587 /* always apply iconv-hooks to suit system's iconv tastes */
588 tocode2 = mutt_ch_iconv_lookup(tocode1);
589 tocode2 = tocode2 ? tocode2 : tocode1;
590 fromcode2 = mutt_ch_iconv_lookup(fromcode1);
591 fromcode2 = fromcode2 ? fromcode2 : fromcode1;
592
593 /* call system iconv with names it appreciates */
594 cd = iconv_open(tocode2, fromcode2);
595 if (cd != (iconv_t) -1)
596 return cd;
597
598 return (iconv_t) -1;
599}
const char * mutt_ch_iconv_lookup(const char *chs)
Look for a replacement character set.
Definition: charset.c:701
const char * mutt_ch_charset_lookup(const char *chs)
Look for a replacement character set.
Definition: charset.c:537
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_iconv()

size_t mutt_ch_iconv ( iconv_t  cd,
const char **  inbuf,
size_t *  inbytesleft,
char **  outbuf,
size_t *  outbytesleft,
const char **  inrepls,
const char *  outrepl,
int *  iconverrno 
)

Change the encoding of a string.

Parameters
[in] cd: Iconv conversion descriptor
[in,out] inbuf: Buffer to convert
[in,out] inbytesleft: Length of buffer to convert
[in,out] outbuf: Buffer for the result
[in,out] outbytesleft: Length of result buffer
[in] inrepls: Input replacement characters
[in] outrepl: Output replacement characters
[out] iconverrno: Errno if iconv() fails, 0 if it succeeds
Return values
num: Characters converted

Like iconv, but keeps going even when the input is invalid. If you're supplying inrepls, the source charset should be stateless; if you're supplying an outrepl, the target charset should be.

Definition at line 617 of file charset.c.

620{
621 size_t rc = 0;
622 const char *ib = *inbuf;
623 size_t ibl = *inbytesleft;
624 char *ob = *outbuf;
625 size_t obl = *outbytesleft;
626
627 while (true)
628 {
629 errno = 0;
630 const size_t ret1 = iconv(cd, (ICONV_CONST char **) &ib, &ibl, &ob, &obl);
631 if (ret1 != (size_t) -1)
632 rc += ret1;
633 if (iconverrno)
634 *iconverrno = errno;
635
636 if (ibl && obl && (errno == EILSEQ))
637 {
638 if (inrepls)
639 {
640 /* Try replacing the input */
641 const char **t = NULL;
642 for (t = inrepls; *t; t++)
643 {
644 const char *ib1 = *t;
645 size_t ibl1 = strlen(*t);
646 char *ob1 = ob;
647 size_t obl1 = obl;
648 iconv(cd, (ICONV_CONST char **) &ib1, &ibl1, &ob1, &obl1);
649 if (ibl1 == 0)
650 {
651 ib++;
652 ibl--;
653 ob = ob1;
654 obl = obl1;
655 rc++;
656 break;
657 }
658 }
659 if (*t)
660 continue;
661 }
662 /* Replace the output */
663 if (!outrepl)
664 outrepl = "?";
665 iconv(cd, NULL, NULL, &ob, &obl);
666 if (obl)
667 {
668 int n = strlen(outrepl);
669 if (n > obl)
670 {
671 outrepl = "?";
672 n = 1;
673 }
674 memcpy(ob, outrepl, n);
675 ib++;
676 ibl--;
677 ob += n;
678 obl -= n;
679 rc++;
680 iconv(cd, NULL, NULL, NULL, NULL); /* for good measure */
681 continue;
682 }
683 }
684 *inbuf = ib;
685 *inbytesleft = ibl;
686 *outbuf = ob;
687 *outbytesleft = obl;
688 return rc;
689 }
690}
#define EILSEQ
Definition: charset.c:51
+ Here is the caller graph for this function:

◆ mutt_ch_iconv_lookup()

const char * mutt_ch_iconv_lookup ( const char *  chs)

Look for a replacement character set.

Parameters
chs: Character set to lookup
Return values
ptr: Replacement character set (if a 'iconv-hook' matches)
NULL: No matching hook

Look through all the 'iconv-hook's. If one matches return the replacement character set.

Definition at line 701 of file charset.c.

702{
704}
@ MUTT_LOOKUP_ICONV
Character set conversion.
Definition: charset.h:68
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_check()

int mutt_ch_check ( const char *  s,
size_t  slen,
const char *  from,
const char *  to 
)

Check whether a string can be converted between encodings.

Parameters
[in] s: String to check
[in] slen: Length of the string to check
[in] from: Current character set
[in] to: Target character set
Return values
0: Success
-1: Error in iconv_open()
>0: Errno as set by iconv()

Definition at line 716 of file charset.c.

717{
718 if (!s || !from || !to)
719 return -1;
720
721 int rc = 0;
722 iconv_t cd = mutt_ch_iconv_open(to, from, MUTT_ICONV_NO_FLAGS);
723 if (cd == (iconv_t) -1)
724 return -1;
725
726 size_t outlen = MB_LEN_MAX * slen;
727 char *out = mutt_mem_malloc(outlen + 1);
728 char *saved_out = out;
729
730 const size_t convlen = iconv(cd, (ICONV_CONST char **) &s, &slen, &out, &outlen);
731 if (convlen == (size_t) -1)
732 rc = errno;
733
734 FREE(&saved_out);
735 iconv_close(cd);
736 return rc;
737}
iconv_t mutt_ch_iconv_open(const char *tocode, const char *fromcode, uint8_t flags)
Set up iconv for conversions.
Definition: charset.c:564
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_convert_string()

int mutt_ch_convert_string ( char **  ps,
const char *  from,
const char *  to,
uint8_t  flags 
)

Convert a string between encodings.

Parameters
[in,out] ps: String to convert
[in] from: Current character set
[in] to: Target character set
[in] flags: Flags, e.g. MUTT_ICONV_HOOK_FROM
Return values
0: Success
-1: Invalid arguments or failure to open an iconv channel
errno: Failure in iconv conversion

Parameter flags is given as-is to mutt_ch_iconv_open(). See there for its meaning and usage policy.

Definition at line 752 of file charset.c.

753{
754 if (!ps)
755 return -1;
756
757 char *s = *ps;
758
759 if (!s || (*s == '\0'))
760 return 0;
761
762 if (!to || !from)
763 return -1;
764
765 const char *repls[] = { "\357\277\275", "?", 0 };
766 int rc = 0;
767
768 iconv_t cd = mutt_ch_iconv_open(to, from, flags);
769 if (cd == (iconv_t) -1)
770 return -1;
771
772 const char **inrepls = NULL;
773 const char *outrepl = NULL;
774
775 if (mutt_ch_is_utf8(to))
776 outrepl = "\357\277\275";
777 else if (mutt_ch_is_utf8(from))
778 inrepls = repls;
779 else
780 outrepl = "?";
781
782 const char *ib = s;
783 size_t ibl = strlen(s);
784 if (ibl >= (SIZE_MAX / MB_LEN_MAX))
785 {
786 iconv_close(cd);
787 return -1;
788 }
789 size_t obl = MB_LEN_MAX * ibl;
790 char *buf = mutt_mem_malloc(obl + 1);
791 char *ob = buf;
792
793 mutt_ch_iconv(cd, &ib, &ibl, &ob, &obl, inrepls, outrepl, &rc);
794 iconv(cd, 0, 0, &ob, &obl);
795 iconv_close(cd);
796
797 *ob = '\0';
798
799 FREE(ps);
800 *ps = buf;
801
802 mutt_str_adjust(ps);
803 return rc;
804}
size_t mutt_ch_iconv(iconv_t cd, const char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft, const char **inrepls, const char *outrepl, int *iconverrno)
Change the encoding of a string.
Definition: charset.c:617
#define mutt_ch_is_utf8(str)
Definition: charset.h:95
void mutt_str_adjust(char **ptr)
Shrink-to-fit a string.
Definition: string.c:370
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_check_charset()

bool mutt_ch_check_charset ( const char *  cs,
bool  strict 
)

Does iconv understand a character set?

Parameters
cs: Character set to check
strict: Check strictly by using iconv
Return values
true: Character set is valid

If strict is false, then finding a matching character set in PreferredMimeNames will be enough. If strict is true, or the charset is not in PreferredMimeNames, then iconv() will be run.

Definition at line 817 of file charset.c.

818{
819 if (!cs)
820 return false;
821
822 if (mutt_ch_is_utf8(cs))
823 return true;
824
825 if (!strict)
826 {
827 for (int i = 0; PreferredMimeNames[i].key; i++)
828 {
829 if (mutt_istr_equal(PreferredMimeNames[i].key, cs) ||
831 {
832 return true;
833 }
834 }
835 }
836
837 iconv_t cd = mutt_ch_iconv_open(cs, cs, MUTT_ICONV_NO_FLAGS);
838 if (cd != (iconv_t) (-1))
839 {
840 iconv_close(cd);
841 return true;
842 }
843
844 return false;
845}
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_fgetconv_open()

struct FgetConv * mutt_ch_fgetconv_open ( FILE *  fp,
const char *  from,
const char *  to,
uint8_t  flags 
)

Prepare a file for charset conversion.

Parameters
fp: FILE ptr to prepare
from: Current character set
to: Destination character set
flags: Flags, e.g. MUTT_ICONV_HOOK_FROM
Return values
ptr: fgetconv handle

Parameter flags is given as-is to mutt_ch_iconv_open().

Definition at line 857 of file charset.c.

858{
859 struct FgetConv *fc = NULL;
860 iconv_t cd = (iconv_t) -1;
861
862 if (from && to)
863 cd = mutt_ch_iconv_open(to, from, flags);
864
865 if (cd != (iconv_t) -1)
866 {
867 static const char *repls[] = { "\357\277\275", "?", 0 };
868
869 fc = mutt_mem_malloc(sizeof(struct FgetConv));
870 fc->p = fc->bufo;
871 fc->ob = fc->bufo;
872 fc->ib = fc->bufi;
873 fc->ibl = 0;
874 fc->inrepls = mutt_ch_is_utf8(to) ? repls : repls + 1;
875 }
876 else
877 fc = mutt_mem_malloc(sizeof(struct FgetConvNot));
878 fc->fp = fp;
879 fc->cd = cd;
880 return fc;
881}
A dummy converter.
Definition: charset.h:57
Cursor for converting a file's encoding.
Definition: charset.h:41
char bufi[512]
Definition: charset.h:44
iconv_t cd
Definition: charset.h:43
char bufo[512]
Definition: charset.h:45
size_t ibl
Definition: charset.h:49
FILE * fp
Definition: charset.h:42
char * p
Definition: charset.h:46
const char ** inrepls
Definition: charset.h:50
char * ib
Definition: charset.h:48
char * ob
Definition: charset.h:47
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_fgetconv_close()

void mutt_ch_fgetconv_close ( struct FgetConv **  fc)

Close an fgetconv handle.

Parameters
[out] fc: fgetconv handle

Definition at line 887 of file charset.c.

888{
889 if (!fc || !*fc)
890 return;
891
892 if ((*fc)->cd != (iconv_t) -1)
893 iconv_close((*fc)->cd);
894 FREE(fc);
895}
+ Here is the caller graph for this function:

◆ mutt_ch_fgetconv()

int mutt_ch_fgetconv ( struct FgetConv *  fc)

Convert a file's character set.

Parameters
fc: FgetConv handle
Return values
num: Next character in the converted file
EOF: Error

A file is read into a buffer and its character set is converted. Each call to this function will return one converted character. The buffer is refilled automatically when empty.

Definition at line 907 of file charset.c.

908{
909 if (!fc)
910 return EOF;
911 if (fc->cd == (iconv_t) -1)
912 return fgetc(fc->fp);
913 if (!fc->p)
914 return EOF;
915 if (fc->p < fc->ob)
916 return (unsigned char) *(fc->p)++;
917
918 /* Try to convert some more */
919 fc->p = fc->bufo;
920 fc->ob = fc->bufo;
921 if (fc->ibl)
922 {
923 size_t obl = sizeof(fc->bufo);
924 iconv(fc->cd, (ICONV_CONST char **) &fc->ib, &fc->ibl, &fc->ob, &obl);
925 if (fc->p < fc->ob)
926 return (unsigned char) *(fc->p)++;
927 }
928
929 /* If we trusted iconv a bit more, we would at this point
930 * ask why it had stopped converting ... */
931
932 /* Try to read some more */
933 if ((fc->ibl == sizeof(fc->bufi)) ||
934 (fc->ibl && (fc->ib + fc->ibl < fc->bufi + sizeof(fc->bufi))))
935 {
936 fc->p = 0;
937 return EOF;
938 }
939 if (fc->ibl)
940 memcpy(fc->bufi, fc->ib, fc->ibl);
941 fc->ib = fc->bufi;
942 fc->ibl += fread(fc->ib + fc->ibl, 1, sizeof(fc->bufi) - fc->ibl, fc->fp);
943
944 /* Try harder this time to convert some */
945 if (fc->ibl)
946 {
947 size_t obl = sizeof(fc->bufo);
948 mutt_ch_iconv(fc->cd, (const char **) &fc->ib, &fc->ibl, &fc->ob, &obl,
949 fc->inrepls, 0, NULL);
950 if (fc->p < fc->ob)
951 return (unsigned char) *(fc->p)++;
952 }
953
954 /* Either the file has finished or one of the buffers is too small */
955 fc->p = 0;
956 return EOF;
957}
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_fgetconvs()

char * mutt_ch_fgetconvs ( char *  buf,
size_t  buflen,
struct FgetConv *  fc 
)

Convert a file's charset into a string buffer.

Parameters
buf: Buffer for result
buflen: Length of buffer
fc: FgetConv handle
Return values
ptr: Success, result buffer
NULL: Error

Read a file into a buffer, converting the character set as it goes.

Definition at line 969 of file charset.c.

970{
971 if (!buf)
972 return NULL;
973
974 size_t r;
975 for (r = 0; (r + 1) < buflen;)
976 {
977 const int c = mutt_ch_fgetconv(fc);
978 if (c == EOF)
979 break;
980 buf[r++] = (char) c;
981 if (c == '\n')
982 break;
983 }
984 buf[r] = '\0';
985
986 if (r > 0)
987 return buf;
988
989 return NULL;
990}
int mutt_ch_fgetconv(struct FgetConv *fc)
Convert a file's character set.
Definition: charset.c:907
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_set_charset()

void mutt_ch_set_charset ( const char *  charset)

Update the records for a new character set.

Parameters
charset: New character set

Check if this character set is utf-8 and pick a suitable replacement character for unprintable characters.

Note
This calls bind_textdomain_codeset() which will affect future message translations.

Definition at line 1002 of file charset.c.

1003{ /* Record whether charset is UTF-8 and pick the matching ReplacementChar */
1004 char buf[256] = { 0 };
1005
1006 mutt_ch_canonical_charset(buf, sizeof(buf), charset); /* buf receives the canonicalised name of charset */
1007
1008 if (mutt_ch_is_utf8(buf))
1009 {
1010 CharsetIsUtf8 = true;
1011 ReplacementChar = 0xfffd; /* U+FFFD REPLACEMENT CHARACTER */
1012 }
1013 else
1014 {
1015 CharsetIsUtf8 = false;
1016 ReplacementChar = '?'; /* plain ASCII fallback for non-UTF-8 charsets */
1017 }
1018
1019#if defined(HAVE_BIND_TEXTDOMAIN_CODESET) && defined(ENABLE_NLS)
1020 bind_textdomain_codeset(PACKAGE, buf); /* compiled only with NLS support; affects future message translations */
1021#endif
1022}
bool CharsetIsUtf8
Is the user's current character set utf-8?
Definition: charset.c:62
wchar_t ReplacementChar
When a Unicode character can't be displayed, use this instead.
Definition: charset.c:57
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_choose()

char * mutt_ch_choose ( const char *  fromcode,
const struct Slist *  charsets,
const char *  u,
size_t  ulen,
char **  d,
size_t *  dlen 
)

Figure the best charset to encode a string.

Parameters
[in] fromcode: Original charset of the string
[in] charsets: List of potential charsets to use
[in] u: String to encode
[in] ulen: Length of the string to encode
[out] d: If not NULL, point it to the converted string
[out] dlen: If not NULL, point it to the length of the d string
Return values
ptr: Best performing charset
NULL: None could be found

Definition at line 1035 of file charset.c.

1037{ /* Try every entry of charsets as a target for u and return the name of the retained candidate */
1038 if (!fromcode || !charsets)
1039 return NULL;
1040
1041 char *e = NULL, *tocode = NULL; /* e: converted text of the retained candidate; tocode: its name */
1042 size_t elen = 0, bestn = 0;
1043
1044 const struct ListNode *np = NULL;
1045 STAILQ_FOREACH(np, &charsets->head, entries)
1046 {
1047 char *t = mutt_str_dup(np->data); /* private copy of the candidate charset name */
1048 if (!t)
1049 continue;
1050
1051 size_t n = mutt_str_len(t); /* n is the length of the candidate's NAME, not of any converted text */
1052 char *s = mutt_strn_dup(u, ulen);
1053 const int rc = d ? mutt_ch_convert_string(&s, fromcode, t, MUTT_ICONV_NO_FLAGS) :
1054 mutt_ch_check(s, ulen, fromcode, t); /* convert only if the caller wants the result in d; otherwise just check */
1055 if (rc) /* non-zero rc: this candidate is rejected */
1056 {
1057 FREE(&t);
1058 FREE(&s);
1059 continue;
1060 }
1061 size_t slen = mutt_str_len(s); /* when d is NULL, s is still the unconverted copy of u */
1062
1063 if (!tocode || (n < bestn)) /* NOTE(review): keeps the first success, or one with a strictly shorter name -- confirm this matches "best performing" */
1064 {
1065 bestn = n;
1066 FREE(&tocode);
1067 tocode = t; /* t is now owned by tocode */
1068 if (d)
1069 {
1070 FREE(&e);
1071 e = s; /* s is now owned by e */
1072 }
1073 else
1074 FREE(&s);
1075 elen = slen;
1076 }
1077 else
1078 {
1079 FREE(&t);
1080 FREE(&s);
1081 }
1082 }
1083 if (tocode)
1084 {
1085 if (d)
1086 *d = e; /* hand the converted string to the caller */
1087 if (dlen)
1088 *dlen = elen;
1089
1090 char canonical_buf[1024] = { 0 };
1091 mutt_ch_canonical_charset(canonical_buf, sizeof(canonical_buf), tocode);
1092 mutt_str_replace(&tocode, canonical_buf); /* the returned name is the canonicalised form */
1093 }
1094 return tocode; /* NULL if no candidate was accepted; presumably the caller frees a non-NULL result -- TODO confirm */
1095}
int mutt_ch_check(const char *s, size_t slen, const char *from, const char *to)
Check whether a string can be converted between encodings.
Definition: charset.c:716
char * mutt_str_replace(char **p, const char *s)
Replace one string with another.
Definition: string.c:326
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

Variable Documentation

◆ ReplacementChar

wchar_t ReplacementChar = '?'

When a Unicode character can't be displayed, use this instead.

Definition at line 57 of file charset.c.

◆ CharsetIsUtf8

bool CharsetIsUtf8 = false

Is the user's current character set utf-8?

Definition at line 62 of file charset.c.

◆ Lookups

struct LookupList Lookups = TAILQ_HEAD_INITIALIZER(Lookups)
static

Definition at line 78 of file charset.c.

◆ PreferredMimeNames

const struct MimeNames PreferredMimeNames[]

Lookup table of preferred charsets.

The following list has been created manually from the data under: http://www.isi.edu/in-notes/iana/assignments/character-sets (last update: 2000-09-07).

Note
It includes only the subset of character sets for which a preferred MIME name is given.

Definition at line 99 of file charset.c.