NeoMutt
Teaching an old dog new tricks
DOXYGEN
Loading...
Searching...
No Matches
charset.h File Reference

Conversion between different character encodings. More...

#include <iconv.h>
#include <stdbool.h>
#include <stdint.h>
#include <wchar.h>
+ Include dependency graph for charset.h:
+ This graph shows which files directly or indirectly include this file:

Go to the source code of this file.

Data Structures

struct  FgetConv
 Cursor for converting a file's encoding. More...
 
struct  FgetConvNot
 A dummy converter. More...
 

Macros

#define MUTT_ICONV_NO_FLAGS   0
 No flags are set.
 
#define MUTT_ICONV_HOOK_FROM   1
 apply charset-hooks to fromcode
 
#define mutt_ch_is_utf8(str)   mutt_ch_chscmp(str, "utf-8")
 
#define mutt_ch_is_us_ascii(str)   mutt_ch_chscmp(str, "us-ascii")
 
#define ICONV_T_INVALID   ((iconv_t) -1)
 Error value for iconv functions.
 
#define ICONV_ILLEGAL_SEQ   ((size_t) -1)
 Error value for iconv() - Illegal sequence.
 
#define ICONV_BUF_TOO_SMALL   ((size_t) -2)
 Error value for iconv() - Buffer too small.
 

Enumerations

enum  LookupType { MUTT_LOOKUP_CHARSET , MUTT_LOOKUP_ICONV }
 Types of character set lookups. More...
 

Functions

void mutt_ch_canonical_charset (char *buf, size_t buflen, const char *name)
 Canonicalise the charset of a string.
 
const char * mutt_ch_charset_lookup (const char *chs)
 Look for a replacement character set.
 
int mutt_ch_check (const char *s, size_t slen, const char *from, const char *to)
 Check whether a string can be converted between encodings.
 
bool mutt_ch_check_charset (const char *cs, bool strict)
 Does iconv understand a character set?
 
char * mutt_ch_choose (const char *fromcode, const struct Slist *charsets, const char *u, size_t ulen, char **d, size_t *dlen)
 Figure the best charset to encode a string.
 
bool mutt_ch_chscmp (const char *cs1, const char *cs2)
 Are the names of two character sets equivalent?
 
int mutt_ch_convert_nonmime_string (const struct Slist *const assumed_charset, const char *charset, char **ps)
 Try to convert a string using a list of character sets.
 
int mutt_ch_convert_string (char **ps, const char *from, const char *to, uint8_t flags)
 Convert a string between encodings.
 
int mutt_ch_fgetconv (struct FgetConv *fc)
 Convert a file's character set.
 
void mutt_ch_fgetconv_close (struct FgetConv **ptr)
 Close an fgetconv handle.
 
struct FgetConv * mutt_ch_fgetconv_open (FILE *fp, const char *from, const char *to, uint8_t flags)
 Prepare a file for charset conversion.
 
char * mutt_ch_fgetconvs (char *buf, size_t buflen, struct FgetConv *fc)
 Convert a file's charset into a string buffer.
 
const char * mutt_ch_get_default_charset (const struct Slist *const assumed_charset)
 Get the default character set.
 
char * mutt_ch_get_langinfo_charset (void)
 Get the user's choice of character set.
 
size_t mutt_ch_iconv (iconv_t cd, const char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft, const char **inrepls, const char *outrepl, int *iconverrno)
 Change the encoding of a string.
 
const char * mutt_ch_iconv_lookup (const char *chs)
 Look for a replacement character set.
 
iconv_t mutt_ch_iconv_open (const char *tocode, const char *fromcode, uint8_t flags)
 Set up iconv for conversions.
 
bool mutt_ch_lookup_add (enum LookupType type, const char *pat, const char *replace, struct Buffer *err)
 Add a new character set lookup.
 
void mutt_ch_lookup_remove (void)
 Remove all the character set lookups.
 
void mutt_ch_set_charset (const char *charset)
 Update the records for a new character set.
 
void mutt_ch_cache_cleanup (void)
 Clean up the cached iconv handles and charset strings.
 
static bool iconv_t_valid (const iconv_t cd)
 Is the conversion descriptor valid?
 

Variables

bool CharsetIsUtf8
 Is the user's current character set utf-8?
 
wchar_t ReplacementChar
 When a Unicode character can't be displayed, use this instead.
 

Detailed Description

Conversion between different character encodings.

Authors
  • Thomas Roessler

This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with this program. If not, see http://www.gnu.org/licenses/.

Definition in file charset.h.

Macro Definition Documentation

◆ MUTT_ICONV_NO_FLAGS

#define MUTT_ICONV_NO_FLAGS   0

No flags are set.

Definition at line 71 of file charset.h.

◆ MUTT_ICONV_HOOK_FROM

#define MUTT_ICONV_HOOK_FROM   1

apply charset-hooks to fromcode

Definition at line 72 of file charset.h.

◆ mutt_ch_is_utf8

#define mutt_ch_is_utf8 (   str)    mutt_ch_chscmp(str, "utf-8")

Definition at line 96 of file charset.h.

◆ mutt_ch_is_us_ascii

#define mutt_ch_is_us_ascii (   str)    mutt_ch_chscmp(str, "us-ascii")

Definition at line 97 of file charset.h.

◆ ICONV_T_INVALID

#define ICONV_T_INVALID   ((iconv_t) -1)

Error value for iconv functions.

Definition at line 100 of file charset.h.

◆ ICONV_ILLEGAL_SEQ

#define ICONV_ILLEGAL_SEQ   ((size_t) -1)

Error value for iconv() - Illegal sequence.

Definition at line 103 of file charset.h.

◆ ICONV_BUF_TOO_SMALL

#define ICONV_BUF_TOO_SMALL   ((size_t) -2)

Error value for iconv() - Buffer too small.

Definition at line 105 of file charset.h.

Enumeration Type Documentation

◆ LookupType

enum LookupType

Types of character set lookups.

Enumerator
MUTT_LOOKUP_CHARSET 

Alias for another character set.

MUTT_LOOKUP_ICONV 

Character set conversion.

Definition at line 65 of file charset.h.

66{
69};
@ MUTT_LOOKUP_ICONV
Character set conversion.
Definition: charset.h:68
@ MUTT_LOOKUP_CHARSET
Alias for another character set.
Definition: charset.h:67

Function Documentation

◆ mutt_ch_canonical_charset()

void mutt_ch_canonical_charset ( char *  buf,
size_t  buflen,
const char *  name 
)

Canonicalise the charset of a string.

Parameters
bufBuffer for canonical character set name
buflenLength of buffer
nameName to be canonicalised

This first ties off any charset extension such as "//TRANSLIT", canonicalizes the charset, and then re-adds the extension.

Definition at line 371 of file charset.c.

372{
373 if (!buf || !name)
374 return;
375
376 char in[1024], scratch[1024 + 10];
377
378 mutt_str_copy(in, name, sizeof(in));
379 char *ext = strchr(in, '/');
380 if (ext)
381 *ext++ = '\0';
382
383 if (mutt_istr_equal(in, "utf-8") || mutt_istr_equal(in, "utf8"))
384 {
385 mutt_str_copy(buf, "utf-8", buflen);
386 goto out;
387 }
388
389 /* catch some common iso-8859-something misspellings */
390 size_t plen;
391 if ((plen = mutt_istr_startswith(in, "8859")) && (in[plen] != '-'))
392 snprintf(scratch, sizeof(scratch), "iso-8859-%s", in + plen);
393 else if ((plen = mutt_istr_startswith(in, "8859-")))
394 snprintf(scratch, sizeof(scratch), "iso-8859-%s", in + plen);
395 else if ((plen = mutt_istr_startswith(in, "iso8859")) && (in[plen] != '-'))
396 snprintf(scratch, sizeof(scratch), "iso_8859-%s", in + plen);
397 else if ((plen = mutt_istr_startswith(in, "iso8859-")))
398 snprintf(scratch, sizeof(scratch), "iso_8859-%s", in + plen);
399 else
400 mutt_str_copy(scratch, in, sizeof(scratch));
401
402 for (size_t i = 0; PreferredMimeNames[i].key; i++)
403 {
404 if (mutt_istr_equal(scratch, PreferredMimeNames[i].key))
405 {
406 mutt_str_copy(buf, PreferredMimeNames[i].pref, buflen);
407 goto out;
408 }
409 }
410
411 mutt_str_copy(buf, scratch, buflen);
412
413 /* for cosmetics' sake, transform to lowercase. */
414 for (char *p = buf; *p; p++)
415 *p = tolower(*p);
416
417out:
418 if (ext && *ext)
419 {
420 mutt_str_cat(buf, buflen, "/");
421 mutt_str_cat(buf, buflen, ext);
422 }
423}
static const struct MimeNames PreferredMimeNames[]
Lookup table of preferred charsets.
Definition: charset.c:118
bool mutt_istr_equal(const char *a, const char *b)
Compare two strings, ignoring case.
Definition: string.c:810
size_t mutt_str_copy(char *dest, const char *src, size_t dsize)
Copy a string into a buffer (guaranteeing NUL-termination)
Definition: string.c:653
size_t mutt_istr_startswith(const char *str, const char *prefix)
Check whether a string starts with a prefix, ignoring case.
Definition: string.c:240
char * mutt_str_cat(char *buf, size_t buflen, const char *s)
Concatenate two strings.
Definition: string.c:266
const char * key
Definition: charset.c:104
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_charset_lookup()

const char * mutt_ch_charset_lookup ( const char *  chs)

Look for a replacement character set.

Parameters
chsCharacter set to lookup
Return values
ptrReplacement character set (if a 'charset-hook' matches)
NULLNo matching hook

Look through all the 'charset-hook's. If one matches return the replacement character set.

Definition at line 557 of file charset.c.

558{
560}
static const char * lookup_charset(enum LookupType type, const char *cs)
Look for a preferred character set name.
Definition: charset.c:300
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_check()

int mutt_ch_check ( const char *  s,
size_t  slen,
const char *  from,
const char *  to 
)

Check whether a string can be converted between encodings.

Parameters
[in]sString to check
[in]slenLength of the string to check
[in]fromCurrent character set
[in]toTarget character set
Return values
0Success
-1Error in iconv_open()
>0Errno as set by iconv()

Definition at line 791 of file charset.c.

792{
793 if (!s || !from || !to)
794 return -1;
795
796 int rc = 0;
797 iconv_t cd = mutt_ch_iconv_open(to, from, MUTT_ICONV_NO_FLAGS);
798 if (!iconv_t_valid(cd))
799 return -1;
800
801 size_t outlen = MB_LEN_MAX * slen;
802 char *out = mutt_mem_malloc(outlen + 1);
803 char *saved_out = out;
804
805 const size_t convlen = iconv(cd, (ICONV_CONST char **) &s, &slen, &out, &outlen);
806 if (convlen == ICONV_ILLEGAL_SEQ)
807 rc = errno;
808
809 FREE(&saved_out);
810 return rc;
811}
void * mutt_mem_malloc(size_t size)
Allocate memory on the heap.
Definition: memory.c:90
#define FREE(x)
Definition: memory.h:45
iconv_t mutt_ch_iconv_open(const char *tocode, const char *fromcode, uint8_t flags)
Set up iconv for conversions.
Definition: charset.c:589
#define MUTT_ICONV_NO_FLAGS
No flags are set.
Definition: charset.h:71
#define ICONV_ILLEGAL_SEQ
Error value for iconv() - Illegal sequence.
Definition: charset.h:103
static bool iconv_t_valid(const iconv_t cd)
Is the conversion descriptor valid?
Definition: charset.h:112
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_check_charset()

bool mutt_ch_check_charset ( const char *  cs,
bool  strict 
)

Does iconv understand a character set?

Parameters
csCharacter set to check
strictCheck strictly by using iconv
Return values
trueCharacter set is valid

If strict is false, then finding a matching character set in PreferredMimeNames will be enough. If strict is true, or the charset is not in PreferredMimeNames, then iconv() will be run.

Definition at line 889 of file charset.c.

890{
891 if (!cs)
892 return false;
893
894 if (mutt_ch_is_utf8(cs))
895 return true;
896
897 if (!strict)
898 {
899 for (int i = 0; PreferredMimeNames[i].key; i++)
900 {
901 if (mutt_istr_equal(PreferredMimeNames[i].key, cs) ||
903 {
904 return true;
905 }
906 }
907 }
908
909 iconv_t cd = mutt_ch_iconv_open(cs, cs, MUTT_ICONV_NO_FLAGS);
910 if (iconv_t_valid(cd))
911 {
912 return true;
913 }
914
915 return false;
916}
#define mutt_ch_is_utf8(str)
Definition: charset.h:96
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_choose()

char * mutt_ch_choose ( const char *  fromcode,
const struct Slist *  charsets,
const char *  u,
size_t  ulen,
char **  d,
size_t *  dlen 
)

Figure the best charset to encode a string.

Parameters
[in]fromcodeOriginal charset of the string
[in]charsetsList of potential charsets to use
[in]uString to encode
[in]ulenLength of the string to encode
[out]dIf not NULL, point it to the converted string
[out]dlenIf not NULL, point it to the length of the d string
Return values
ptrBest performing charset
NULLNone could be found

Definition at line 1106 of file charset.c.

1108{
1109 if (!fromcode || !charsets)
1110 return NULL;
1111
1112 char *e = NULL, *tocode = NULL;
1113 size_t elen = 0, bestn = 0;
1114
1115 const struct ListNode *np = NULL;
1116 STAILQ_FOREACH(np, &charsets->head, entries)
1117 {
1118 char *t = mutt_str_dup(np->data);
1119 if (!t)
1120 continue;
1121
1122 size_t n = mutt_str_len(t);
1123 char *s = mutt_strn_dup(u, ulen);
1124 const int rc = d ? mutt_ch_convert_string(&s, fromcode, t, MUTT_ICONV_NO_FLAGS) :
1125 mutt_ch_check(s, ulen, fromcode, t);
1126 if (rc)
1127 {
1128 FREE(&t);
1129 FREE(&s);
1130 continue;
1131 }
1132 size_t slen = mutt_str_len(s);
1133
1134 if (!tocode || (n < bestn))
1135 {
1136 bestn = n;
1137 FREE(&tocode);
1138 tocode = t;
1139 if (d)
1140 {
1141 FREE(&e);
1142 e = s;
1143 }
1144 else
1145 {
1146 FREE(&s);
1147 }
1148 elen = slen;
1149 }
1150 else
1151 {
1152 FREE(&t);
1153 FREE(&s);
1154 }
1155 }
1156 if (tocode)
1157 {
1158 if (d)
1159 *d = e;
1160 if (dlen)
1161 *dlen = elen;
1162
1163 char canonical_buf[1024] = { 0 };
1164 mutt_ch_canonical_charset(canonical_buf, sizeof(canonical_buf), tocode);
1165 mutt_str_replace(&tocode, canonical_buf);
1166 }
1167 return tocode;
1168}
void mutt_ch_canonical_charset(char *buf, size_t buflen, const char *name)
Canonicalise the charset of a string.
Definition: charset.c:371
int mutt_ch_convert_string(char **ps, const char *from, const char *to, uint8_t flags)
Convert a string between encodings.
Definition: charset.c:826
int mutt_ch_check(const char *s, size_t slen, const char *from, const char *to)
Check whether a string can be converted between encodings.
Definition: charset.c:791
char * mutt_strn_dup(const char *begin, size_t len)
Duplicate a sub-string.
Definition: string.c:452
char * mutt_str_dup(const char *str)
Copy a string, safely.
Definition: string.c:251
size_t mutt_str_len(const char *a)
Calculate the length of a string, safely.
Definition: string.c:568
char * mutt_str_replace(char **p, const char *s)
Replace one string with another.
Definition: string.c:327
#define STAILQ_FOREACH(var, head, field)
Definition: queue.h:352
A List node for strings.
Definition: list.h:35
char * data
String.
Definition: list.h:36
struct ListHead head
List containing values.
Definition: slist.h:48
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_chscmp()

bool mutt_ch_chscmp ( const char *  cs1,
const char *  cs2 
)

Are the names of two character sets equivalent?

Parameters
cs1First character set
cs2Second character set
Return values
trueNames are equivalent
falseNames differ

Charsets may have extensions that mutt_ch_canonical_charset() leaves intact. We expect 'cs2' to originate from NeoMutt code, not user input (i.e. 'cs2' does not have any extension), so we simply check if the shorter string is a prefix of the longer.

Definition at line 437 of file charset.c.

438{
439 if (!cs1 || !cs2)
440 return false;
441
442 char buf[256] = { 0 };
443
444 mutt_ch_canonical_charset(buf, sizeof(buf), cs1);
445
446 int len1 = mutt_str_len(buf);
447 int len2 = mutt_str_len(cs2);
448
449 return mutt_istrn_equal(((len1 > len2) ? buf : cs2),
450 ((len1 > len2) ? cs2 : buf), MIN(len1, len2));
451}
#define MIN(a, b)
Definition: memory.h:32
bool mutt_istrn_equal(const char *a, const char *b, size_t num)
Check for equality of two strings ignoring case (to a maximum), safely.
Definition: string.c:525
+ Here is the call graph for this function:

◆ mutt_ch_convert_nonmime_string()

int mutt_ch_convert_nonmime_string ( const struct Slist *const  assumed_charset,
const char *  charset,
char **  ps 
)

Try to convert a string using a list of character sets.

Parameters
[in]assumed_charsetFrom $assumed_charset
[in]charsetFrom $charset
[in,out]psString to be converted
Return values
0Success
-1Error

Work through $assumed_charset looking for a character set conversion that works. Failing that, try mutt_ch_get_default_charset().

Definition at line 328 of file charset.c.

330{
331 if (!ps)
332 return -1;
333
334 char *u = *ps;
335 const size_t ulen = mutt_str_len(u);
336 if (ulen == 0)
337 return 0;
338
339 const struct ListNode *np = NULL;
340 STAILQ_FOREACH(np, &assumed_charset->head, entries)
341 {
342 char const *c = np->data;
343 size_t n = mutt_str_len(c);
344 char *fromcode = mutt_mem_malloc(n + 1);
345 mutt_str_copy(fromcode, c, n + 1);
346 char *s = mutt_strn_dup(u, ulen);
347 int m = mutt_ch_convert_string(&s, fromcode, charset, MUTT_ICONV_NO_FLAGS);
348 FREE(&fromcode);
349 if (m == 0)
350 {
351 FREE(ps);
352 *ps = s;
353 return 0;
354 }
355 FREE(&s);
356 }
358 charset, MUTT_ICONV_HOOK_FROM);
359 return -1;
360}
const char * mutt_ch_get_default_charset(const struct Slist *const assumed_charset)
Get the default character set.
Definition: charset.c:460
#define MUTT_ICONV_HOOK_FROM
apply charset-hooks to fromcode
Definition: charset.h:72
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_convert_string()

int mutt_ch_convert_string ( char **  ps,
const char *  from,
const char *  to,
uint8_t  flags 
)

Convert a string between encodings.

Parameters
[in,out]psString to convert
[in]fromCurrent character set
[in]toTarget character set
[in]flagsFlags, e.g. MUTT_ICONV_HOOK_FROM
Return values
0Success
-1Invalid arguments or failure to open an iconv channel
errnoFailure in iconv conversion

Parameter flags is given as-is to mutt_ch_iconv_open(). See there for its meaning and usage policy.

Definition at line 826 of file charset.c.

827{
828 if (!ps)
829 return -1;
830
831 char *s = *ps;
832
833 if (!s || (*s == '\0'))
834 return 0;
835
836 if (!to || !from)
837 return -1;
838
839 const char *repls[] = { "\357\277\275", "?", 0 };
840 int rc = 0;
841
842 iconv_t cd = mutt_ch_iconv_open(to, from, flags);
843 if (!iconv_t_valid(cd))
844 return -1;
845
846 const char **inrepls = NULL;
847 const char *outrepl = NULL;
848
849 if (mutt_ch_is_utf8(to))
850 outrepl = "\357\277\275";
851 else if (mutt_ch_is_utf8(from))
852 inrepls = repls;
853 else
854 outrepl = "?";
855
856 const char *ib = s;
857 size_t ibl = strlen(s);
858 if (ibl >= (SIZE_MAX / MB_LEN_MAX))
859 {
860 return -1;
861 }
862 size_t obl = MB_LEN_MAX * ibl;
863 char *buf = mutt_mem_malloc(obl + 1);
864 char *ob = buf;
865
866 mutt_ch_iconv(cd, &ib, &ibl, &ob, &obl, inrepls, outrepl, &rc);
867 iconv(cd, 0, 0, &ob, &obl);
868
869 *ob = '\0';
870
871 FREE(ps);
872 *ps = buf;
873
874 mutt_str_adjust(ps);
875 return rc;
876}
size_t mutt_ch_iconv(iconv_t cd, const char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft, const char **inrepls, const char *outrepl, int *iconverrno)
Change the encoding of a string.
Definition: charset.c:692
void mutt_str_adjust(char **ptr)
Shrink-to-fit a string.
Definition: string.c:371
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_fgetconv()

int mutt_ch_fgetconv ( struct FgetConv *  fc)

Convert a file's character set.

Parameters
fcFgetConv handle
Return values
numNext character in the converted file
EOFError

A file is read into a buffer and its character set is converted. Each call to this function will return one converted character. The buffer is refilled automatically when empty.

Definition at line 978 of file charset.c.

979{
980 if (!fc)
981 return EOF;
982 if (!iconv_t_valid(fc->cd))
983 return fgetc(fc->fp);
984 if (!fc->p)
985 return EOF;
986 if (fc->p < fc->ob)
987 return (unsigned char) *(fc->p)++;
988
989 /* Try to convert some more */
990 fc->p = fc->bufo;
991 fc->ob = fc->bufo;
992 if (fc->ibl)
993 {
994 size_t obl = sizeof(fc->bufo);
995 iconv(fc->cd, (ICONV_CONST char **) &fc->ib, &fc->ibl, &fc->ob, &obl);
996 if (fc->p < fc->ob)
997 return (unsigned char) *(fc->p)++;
998 }
999
1000 /* If we trusted iconv a bit more, we would at this point
1001 * ask why it had stopped converting ... */
1002
1003 /* Try to read some more */
1004 if ((fc->ibl == sizeof(fc->bufi)) ||
1005 (fc->ibl && (fc->ib + fc->ibl < fc->bufi + sizeof(fc->bufi))))
1006 {
1007 fc->p = 0;
1008 return EOF;
1009 }
1010 if (fc->ibl)
1011 memcpy(fc->bufi, fc->ib, fc->ibl);
1012 fc->ib = fc->bufi;
1013 fc->ibl += fread(fc->ib + fc->ibl, 1, sizeof(fc->bufi) - fc->ibl, fc->fp);
1014
1015 /* Try harder this time to convert some */
1016 if (fc->ibl)
1017 {
1018 size_t obl = sizeof(fc->bufo);
1019 mutt_ch_iconv(fc->cd, (const char **) &fc->ib, &fc->ibl, &fc->ob, &obl,
1020 fc->inrepls, 0, NULL);
1021 if (fc->p < fc->ob)
1022 return (unsigned char) *(fc->p)++;
1023 }
1024
1025 /* Either the file has finished or one of the buffers is too small */
1026 fc->p = 0;
1027 return EOF;
1028}
char bufi[512]
Definition: charset.h:44
iconv_t cd
iconv conversion descriptor
Definition: charset.h:43
char bufo[512]
Definition: charset.h:45
size_t ibl
Definition: charset.h:49
FILE * fp
Definition: charset.h:42
char * p
Definition: charset.h:46
const char ** inrepls
Definition: charset.h:50
char * ib
Definition: charset.h:48
char * ob
Definition: charset.h:47
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_fgetconv_close()

void mutt_ch_fgetconv_close ( struct FgetConv **  ptr)

Close an fgetconv handle.

Parameters
[out]ptrfgetconv handle

Definition at line 960 of file charset.c.

961{
962 if (!ptr || !*ptr)
963 return;
964
965 FREE(ptr);
966}
+ Here is the caller graph for this function:

◆ mutt_ch_fgetconv_open()

struct FgetConv * mutt_ch_fgetconv_open ( FILE *  fp,
const char *  from,
const char *  to,
uint8_t  flags 
)

Prepare a file for charset conversion.

Parameters
fpFILE ptr to prepare
fromCurrent character set
toDestination character set
flagsFlags, e.g. MUTT_ICONV_HOOK_FROM
Return values
ptrfgetconv handle

Parameter flags is given as-is to mutt_ch_iconv_open().

Definition at line 928 of file charset.c.

929{
930 struct FgetConv *fc = NULL;
931 iconv_t cd = ICONV_T_INVALID;
932
933 if (from && to)
934 cd = mutt_ch_iconv_open(to, from, flags);
935
936 if (iconv_t_valid(cd))
937 {
938 static const char *repls[] = { "\357\277\275", "?", 0 };
939
940 fc = mutt_mem_malloc(sizeof(struct FgetConv));
941 fc->p = fc->bufo;
942 fc->ob = fc->bufo;
943 fc->ib = fc->bufi;
944 fc->ibl = 0;
945 fc->inrepls = mutt_ch_is_utf8(to) ? repls : repls + 1;
946 }
947 else
948 {
949 fc = mutt_mem_malloc(sizeof(struct FgetConvNot));
950 }
951 fc->fp = fp;
952 fc->cd = cd;
953 return fc;
954}
#define ICONV_T_INVALID
Error value for iconv functions.
Definition: charset.h:100
A dummy converter.
Definition: charset.h:57
Cursor for converting a file's encoding.
Definition: charset.h:41
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_fgetconvs()

char * mutt_ch_fgetconvs ( char *  buf,
size_t  buflen,
struct FgetConv *  fc 
)

Convert a file's charset into a string buffer.

Parameters
bufBuffer for result
buflenLength of buffer
fcFgetConv handle
Return values
ptrSuccess, result buffer
NULLError

Read a file into a buffer, converting the character set as it goes.

Definition at line 1040 of file charset.c.

1041{
1042 if (!buf)
1043 return NULL;
1044
1045 size_t r;
1046 for (r = 0; (r + 1) < buflen;)
1047 {
1048 const int c = mutt_ch_fgetconv(fc);
1049 if (c == EOF)
1050 break;
1051 buf[r++] = (char) c;
1052 if (c == '\n')
1053 break;
1054 }
1055 buf[r] = '\0';
1056
1057 if (r > 0)
1058 return buf;
1059
1060 return NULL;
1061}
int mutt_ch_fgetconv(struct FgetConv *fc)
Convert a file's character set.
Definition: charset.c:978
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_get_default_charset()

const char * mutt_ch_get_default_charset ( const struct Slist *const  assumed_charset)

Get the default character set.

Parameters
assumed_charsetFrom $assumed_charset
Return values
ptrName of the default character set
Warning
This returns a pointer to a static buffer. Do not free it.

Definition at line 460 of file charset.c.

461{
462 static char fcharset[128];
463 const char *c = NULL;
464
465 if (assumed_charset && (assumed_charset->count > 0))
466 c = STAILQ_FIRST(&assumed_charset->head)->data;
467 else
468 c = "us-ascii";
469
470 mutt_str_copy(fcharset, c, sizeof(fcharset));
471 return fcharset;
472}
#define STAILQ_FIRST(head)
Definition: queue.h:350
size_t count
Number of values in list.
Definition: slist.h:49
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_get_langinfo_charset()

char * mutt_ch_get_langinfo_charset ( void  )

Get the user's choice of character set.

Return values
ptrCharset string

Get the canonical character set used by the user's locale. The caller must free the returned string.

Definition at line 481 of file charset.c.

482{
483 char buf[1024] = { 0 };
484
485 mutt_ch_canonical_charset(buf, sizeof(buf), nl_langinfo(CODESET));
486
487 if (buf[0] != '\0')
488 return mutt_str_dup(buf);
489
490 return mutt_str_dup("iso-8859-1");
491}
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_iconv()

size_t mutt_ch_iconv ( iconv_t  cd,
const char **  inbuf,
size_t *  inbytesleft,
char **  outbuf,
size_t *  outbytesleft,
const char **  inrepls,
const char *  outrepl,
int *  iconverrno 
)

Change the encoding of a string.

Parameters
[in]cdIconv conversion descriptor
[in,out]inbufBuffer to convert
[in,out]inbytesleftLength of buffer to convert
[in,out]outbufBuffer for the result
[in,out]outbytesleftLength of result buffer
[in]inreplsInput replacement characters
[in]outreplOutput replacement characters
[out]iconverrnoErrno if iconv() fails, 0 if it succeeds
Return values
numCharacters converted

Like iconv, but keeps going even when the input is invalid. If you're supplying inrepls, the source charset should be stateless; if you're supplying an outrepl, the target charset should be.

Definition at line 692 of file charset.c.

695{
696 size_t rc = 0;
697 const char *ib = *inbuf;
698 size_t ibl = *inbytesleft;
699 char *ob = *outbuf;
700 size_t obl = *outbytesleft;
701
702 while (true)
703 {
704 errno = 0;
705 const size_t ret1 = iconv(cd, (ICONV_CONST char **) &ib, &ibl, &ob, &obl);
706 if (ret1 != ICONV_ILLEGAL_SEQ)
707 rc += ret1;
708 if (iconverrno)
709 *iconverrno = errno;
710
711 if (ibl && obl && (errno == EILSEQ))
712 {
713 if (inrepls)
714 {
715 /* Try replacing the input */
716 const char **t = NULL;
717 for (t = inrepls; *t; t++)
718 {
719 const char *ib1 = *t;
720 size_t ibl1 = strlen(*t);
721 char *ob1 = ob;
722 size_t obl1 = obl;
723 iconv(cd, (ICONV_CONST char **) &ib1, &ibl1, &ob1, &obl1);
724 if (ibl1 == 0)
725 {
726 ib++;
727 ibl--;
728 ob = ob1;
729 obl = obl1;
730 rc++;
731 break;
732 }
733 }
734 if (*t)
735 continue;
736 }
737 /* Replace the output */
738 if (!outrepl)
739 outrepl = "?";
740 iconv(cd, NULL, NULL, &ob, &obl);
741 if (obl)
742 {
743 int n = strlen(outrepl);
744 if (n > obl)
745 {
746 outrepl = "?";
747 n = 1;
748 }
749 memcpy(ob, outrepl, n);
750 ib++;
751 ibl--;
752 ob += n;
753 obl -= n;
754 rc++;
755 iconv(cd, NULL, NULL, NULL, NULL); /* for good measure */
756 continue;
757 }
758 }
759 *inbuf = ib;
760 *inbytesleft = ibl;
761 *outbuf = ob;
762 *outbytesleft = obl;
763 return rc;
764 }
765}
#define EILSEQ
Definition: charset.c:52
+ Here is the caller graph for this function:

◆ mutt_ch_iconv_lookup()

const char * mutt_ch_iconv_lookup ( const char *  chs)

Look for a replacement character set.

Parameters
chsCharacter set to lookup
Return values
ptrReplacement character set (if an 'iconv-hook' matches)
NULLNo matching hook

Look through all the 'iconv-hook's. If one matches return the replacement character set.

Definition at line 776 of file charset.c.

777{
779}
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_iconv_open()

iconv_t mutt_ch_iconv_open ( const char *  tocode,
const char *  fromcode,
uint8_t  flags 
)

Set up iconv for conversions.

Parameters
tocodeTarget character set
fromcodeCurrent character set
flagsFlags, e.g. MUTT_ICONV_HOOK_FROM
Return values
ptriconv handle for the conversion

Like iconv_open, but canonicalises the charsets, applies charset-hooks, recanonicalises, and finally applies iconv-hooks. Parameter flags=0 skips charset-hooks, while MUTT_ICONV_HOOK_FROM applies them to fromcode. Callers should use flags=0 when fromcode can safely be considered true, either some constant, or some value provided by the user; MUTT_ICONV_HOOK_FROM should be used only when fromcode is unsure, taken from a possibly wrong incoming MIME label, or such. Misusing MUTT_ICONV_HOOK_FROM leads to unwanted interactions in some setups.

Since calling iconv_open() repeatedly can be expensive, we keep a cache of the most recently used iconv_t objects, kept in LRU order. This means that you should not call iconv_close() on the object yourself. All remaining objects in the cache will be cleaned up when main() calls mutt_ch_cache_cleanup().

Note
By design charset-hooks should never be, and are never, applied to tocode.
Despite its name, MUTT_ICONV_HOOK_FROM acts on charset-hooks, not at all on iconv-hooks.

Definition at line 589 of file charset.c.

590{
591 char tocode1[128];
592 char fromcode1[128];
593 const char *tocode2 = NULL, *fromcode2 = NULL;
594 const char *tmp = NULL;
595
596 /* transform to MIME preferred charset names */
597 mutt_ch_canonical_charset(tocode1, sizeof(tocode1), tocode);
598 mutt_ch_canonical_charset(fromcode1, sizeof(fromcode1), fromcode);
599
600 /* maybe apply charset-hooks and recanonicalise fromcode,
601 * but only when caller asked us to sanitize a potentially wrong
602 * charset name incoming from the wild exterior. */
603 if (flags & MUTT_ICONV_HOOK_FROM)
604 {
605 tmp = mutt_ch_charset_lookup(fromcode1);
606 if (tmp)
607 mutt_ch_canonical_charset(fromcode1, sizeof(fromcode1), tmp);
608 }
609
610 /* check if we have this pair cached already */
611 for (int i = 0; i < IconvCacheUsed; ++i)
612 {
613 if (strcmp(tocode1, IconvCache[i].tocode1) == 0 &&
614 strcmp(fromcode1, IconvCache[i].fromcode1) == 0)
615 {
616 iconv_t cd = IconvCache[i].cd;
617
618 /* make room for this one at the top */
619 struct IconvCacheEntry top = IconvCache[i];
620 for (int j = i; j-- > 0;)
621 {
622 IconvCache[j + 1] = IconvCache[j];
623 }
624 IconvCache[0] = top;
625
626 if (iconv_t_valid(cd))
627 {
628 /* reset state */
629 iconv(cd, NULL, NULL, NULL, NULL);
630 }
631 return cd;
632 }
633 }
634
635 /* not found in cache */
636 /* always apply iconv-hooks to suit system's iconv tastes */
637 tocode2 = mutt_ch_iconv_lookup(tocode1);
638 tocode2 = tocode2 ? tocode2 : tocode1;
639 fromcode2 = mutt_ch_iconv_lookup(fromcode1);
640 fromcode2 = fromcode2 ? fromcode2 : fromcode1;
641
642 /* call system iconv with names it appreciates */
643 iconv_t cd = iconv_open(tocode2, fromcode2);
644
646 {
647 mutt_debug(LL_DEBUG2, "iconv: dropping %s -> %s from the cache\n",
650 /* get rid of the oldest entry */
654 {
655 iconv_close(IconvCache[IconvCacheUsed - 1].cd);
656 }
658 }
659
660 /* make room for this one at the top */
661 for (int j = IconvCacheUsed; j-- > 0;)
662 {
663 IconvCache[j + 1] = IconvCache[j];
664 }
665
667
668 mutt_debug(LL_DEBUG2, "iconv: adding %s -> %s to the cache\n", fromcode1, tocode1);
669 IconvCache[0].fromcode1 = strdup(fromcode1);
670 IconvCache[0].tocode1 = strdup(tocode1);
671 IconvCache[0].cd = cd;
672
673 return cd;
674}
#define mutt_debug(LEVEL,...)
Definition: logging2.h:89
@ LL_DEBUG2
Log at debug level 2.
Definition: logging2.h:44
static int IconvCacheUsed
Number of iconv descriptors in the cache.
Definition: charset.c:97
const char * mutt_ch_iconv_lookup(const char *chs)
Look for a replacement character set.
Definition: charset.c:776
const char * mutt_ch_charset_lookup(const char *chs)
Look for a replacement character set.
Definition: charset.c:557
#define ICONV_CACHE_SIZE
Max size of the iconv cache.
Definition: charset.c:93
static struct IconvCacheEntry IconvCache[ICONV_CACHE_SIZE]
Cache of iconv conversion descriptors.
Definition: charset.c:95
Cached iconv conversion descriptor.
Definition: charset.c:86
char * tocode1
Destination character set.
Definition: charset.c:88
char * fromcode1
Source character set.
Definition: charset.c:87
iconv_t cd
iconv conversion descriptor
Definition: charset.c:89
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_lookup_add()

bool mutt_ch_lookup_add ( enum LookupType  type,
const char *  pat,
const char *  replace,
struct Buffer *  err 
)

Add a new character set lookup.

Parameters
type: Type of character set, e.g. MUTT_LOOKUP_CHARSET
pat: Pattern to match
replace: Replacement string
err: Buffer for error message
Return values
true: Lookup added to list
false: Regex string was invalid

Add a regex for a character set and a replacement name.

Definition at line 504 of file charset.c.

/* Register a character set lookup: compile `pat` as a case-insensitive regex
 * (REG_ICASE) and append a new Lookup, pairing it with `replace`, to the tail
 * of the Lookups list.
 *
 * Returns true once the Lookup has been queued. Returns false when `pat` or
 * `replace` is NULL, or when the pattern does not compile; in the latter case
 * the regerror() message is written into err->data, bounded by err->dsize. */
506{
/* Both a pattern and a replacement are required */
507 if (!pat || !replace)
508 return false;
509
/* Zeroed heap storage for the compiled expression */
510 regex_t *rx = mutt_mem_calloc(1, sizeof(regex_t));
511 int rc = REG_COMP(rx, pat, REG_ICASE);
512 if (rc != 0)
513 {
/* NOTE(review): err is dereferenced without a NULL check -- confirm that
 * every caller passes a valid Buffer */
514 regerror(rc, rx, err->data, err->dsize);
515 FREE(&rx);
516 return false;
517 }
518
/* Fill in the new Lookup: it stores mutt_str_dup() results for both strings
 * (presumably heap copies -- verify) and the rx pointer itself, which is not
 * freed in this function */
519 struct Lookup *l = lookup_new();
520 l->type = type;
521 l->replacement = mutt_str_dup(replace);
522 l->regex.pattern = mutt_str_dup(pat);
523 l->regex.regex = rx;
524 l->regex.pat_not = false;
525
526 TAILQ_INSERT_TAIL(&Lookups, l, entries);
527
528 return true;
529}
void * mutt_mem_calloc(size_t nmemb, size_t size)
Allocate zeroed memory on the heap.
Definition: memory.c:50
static struct LookupList Lookups
Lookup table of preferred character set names.
Definition: charset.c:80
static struct Lookup * lookup_new(void)
Create a new Lookup.
Definition: charset.c:266
#define TAILQ_INSERT_TAIL(head, elm, field)
Definition: queue.h:809
#define REG_COMP(preg, regex, cflags)
Compile a regular expression.
Definition: regex3.h:53
size_t dsize
Length of data.
Definition: buffer.h:37
char * data
Pointer to data.
Definition: buffer.h:35
Regex to String lookup table.
Definition: charset.c:71
char * replacement
Alternative charset to use.
Definition: charset.c:74
enum LookupType type
Lookup type.
Definition: charset.c:72
struct Regex regex
Regular expression.
Definition: charset.c:73
char * pattern
printable version
Definition: regex3.h:90
bool pat_not
do not match
Definition: regex3.h:92
regex_t * regex
compiled expression
Definition: regex3.h:91
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_lookup_remove()

void mutt_ch_lookup_remove ( void  )

Remove all the character set lookups.

Empty the list of replacement character set names.

Definition at line 536 of file charset.c.

/* Empty the Lookups list: unlink every entry and hand it to lookup_free(). */
537{
538 struct Lookup *l = NULL;
/* Scratch cursor required by TAILQ_FOREACH_SAFE so that `l` can be removed
 * and freed while the list is being walked */
539 struct Lookup *tmp = NULL;
540
541 TAILQ_FOREACH_SAFE(l, &Lookups, entries, tmp)
542 {
/* Unlink first, then free */
543 TAILQ_REMOVE(&Lookups, l, entries);
544 lookup_free(&l);
545 }
546}
static void lookup_free(struct Lookup **ptr)
Free a Lookup.
Definition: charset.c:275
#define TAILQ_FOREACH_SAFE(var, head, field, tvar)
Definition: queue.h:735
#define TAILQ_REMOVE(head, elm, field)
Definition: queue.h:841
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_set_charset()

void mutt_ch_set_charset ( const char *  charset)

Update the records for a new character set.

Parameters
charset: New character set

Check if this character set is utf-8 and pick a suitable replacement character for unprintable characters.

Note
This calls bind_textdomain_codeset() which will affect future message translations.

Definition at line 1073 of file charset.c.

/* Record a new character set: canonicalise its name, then update the
 * CharsetIsUtf8 flag and the ReplacementChar used for characters that cannot
 * be displayed. When NLS support is compiled in, the canonical name is also
 * passed to bind_textdomain_codeset(). */
1074{
/* Receives the canonical form of `charset` */
1075 char buf[256] = { 0 };
1076
1077 mutt_ch_canonical_charset(buf, sizeof(buf), charset);
1078
/* mutt_ch_is_utf8() compares the name against "utf-8" via mutt_ch_chscmp() */
1079 if (mutt_ch_is_utf8(buf))
1080 {
1081 CharsetIsUtf8 = true;
1082 ReplacementChar = 0xfffd; /* replacement character */
1083 }
1084 else
1085 {
/* Any other charset falls back to a plain question mark */
1086 CharsetIsUtf8 = false;
1087 ReplacementChar = '?';
1088 }
1089
/* Only compiled when both HAVE_BIND_TEXTDOMAIN_CODESET and ENABLE_NLS are
 * defined */
1090#if defined(HAVE_BIND_TEXTDOMAIN_CODESET) && defined(ENABLE_NLS)
1091 bind_textdomain_codeset(PACKAGE, buf);
1092#endif
1093}
bool CharsetIsUtf8
Is the user's current character set utf-8?
Definition: charset.c:63
wchar_t ReplacementChar
When a Unicode character can't be displayed, use this instead.
Definition: charset.c:58
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_cache_cleanup()

void mutt_ch_cache_cleanup ( void  )

Clean up the cached iconv handles and charset strings.

Definition at line 1173 of file charset.c.

/* Release every in-use IconvCache entry: free both charset name strings,
 * close the conversion descriptor if it is valid, then mark the cache empty. */
1174{
/* Only the first IconvCacheUsed slots are visited */
1175 for (int i = 0; i < IconvCacheUsed; ++i)
1176 {
1177 FREE(&IconvCache[i].fromcode1);
1178 FREE(&IconvCache[i].tocode1);
/* Skip iconv_close() for entries whose descriptor is ICONV_T_INVALID */
1179 if (iconv_t_valid(IconvCache[i].cd))
1180 {
1181 iconv_close(IconvCache[i].cd);
1182 }
1183 }
1184 IconvCacheUsed = 0;
1185}
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ iconv_t_valid()

static bool iconv_t_valid ( const iconv_t  cd)
inline static

Is the conversion descriptor valid?

Parameters
cd: Conversion descriptor to test
Return values
true: It's valid

Definition at line 112 of file charset.h.

/* True when `cd` is anything other than ICONV_T_INVALID, i.e. ((iconv_t) -1),
 * the error value for iconv functions. */
113{
114 return cd != ICONV_T_INVALID;
115}
+ Here is the caller graph for this function:

Variable Documentation

◆ CharsetIsUtf8

bool CharsetIsUtf8
extern

Is the user's current character set utf-8?

Definition at line 63 of file charset.c.

◆ ReplacementChar

wchar_t ReplacementChar
extern

When a Unicode character can't be displayed, use this instead.

Definition at line 58 of file charset.c.