NeoMutt  2023-03-22-27-g3cb248
Teaching an old dog new tricks
DOXYGEN
charset.h File Reference

Conversion between different character encodings. More...

#include <iconv.h>
#include <stdbool.h>
#include <stdint.h>
#include <wchar.h>
+ Include dependency graph for charset.h:
+ This graph shows which files directly or indirectly include this file:

Go to the source code of this file.

Data Structures

struct  FgetConv
 Cursor for converting a file's encoding. More...
 
struct  FgetConvNot
 A dummy converter. More...
 

Macros

#define MUTT_ICONV_NO_FLAGS   0
 No flags are set. More...
 
#define MUTT_ICONV_HOOK_FROM   1
 apply charset-hooks to fromcode More...
 
#define mutt_ch_is_utf8(str)   mutt_ch_chscmp(str, "utf-8")
 
#define mutt_ch_is_us_ascii(str)   mutt_ch_chscmp(str, "us-ascii")
 

Enumerations

enum  LookupType { MUTT_LOOKUP_CHARSET , MUTT_LOOKUP_ICONV }
 Types of character set lookups. More...
 

Functions

void mutt_ch_canonical_charset (char *buf, size_t buflen, const char *name)
 Canonicalise the charset of a string. More...
 
const char * mutt_ch_charset_lookup (const char *chs)
 Look for a replacement character set. More...
 
int mutt_ch_check (const char *s, size_t slen, const char *from, const char *to)
 Check whether a string can be converted between encodings. More...
 
bool mutt_ch_check_charset (const char *cs, bool strict)
 Does iconv understand a character set? More...
 
char * mutt_ch_choose (const char *fromcode, const struct Slist *charsets, const char *u, size_t ulen, char **d, size_t *dlen)
 Figure the best charset to encode a string. More...
 
bool mutt_ch_chscmp (const char *cs1, const char *cs2)
 Are the names of two character sets equivalent? More...
 
int mutt_ch_convert_nonmime_string (const struct Slist *const assumed_charset, const char *charset, char **ps)
 Try to convert a string using a list of character sets. More...
 
int mutt_ch_convert_string (char **ps, const char *from, const char *to, uint8_t flags)
 Convert a string between encodings. More...
 
int mutt_ch_fgetconv (struct FgetConv *fc)
 Convert a file's character set. More...
 
void mutt_ch_fgetconv_close (struct FgetConv **fc)
 Close an fgetconv handle. More...
 
struct FgetConv * mutt_ch_fgetconv_open (FILE *fp, const char *from, const char *to, uint8_t flags)
 Prepare a file for charset conversion. More...
 
char * mutt_ch_fgetconvs (char *buf, size_t buflen, struct FgetConv *fc)
 Convert a file's charset into a string buffer. More...
 
const char * mutt_ch_get_default_charset (const struct Slist *const assumed_charset)
 Get the default character set. More...
 
char * mutt_ch_get_langinfo_charset (void)
 Get the user's choice of character set. More...
 
size_t mutt_ch_iconv (iconv_t cd, const char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft, const char **inrepls, const char *outrepl, int *iconverrno)
 Change the encoding of a string. More...
 
const char * mutt_ch_iconv_lookup (const char *chs)
 Look for a replacement character set. More...
 
iconv_t mutt_ch_iconv_open (const char *tocode, const char *fromcode, uint8_t flags)
 Set up iconv for conversions. More...
 
bool mutt_ch_lookup_add (enum LookupType type, const char *pat, const char *replace, struct Buffer *err)
 Add a new character set lookup. More...
 
void mutt_ch_lookup_remove (void)
 Remove all the character set lookups. More...
 
void mutt_ch_set_charset (const char *charset)
 Update the records for a new character set. More...
 

Variables

bool CharsetIsUtf8
 Is the user's current character set utf-8? More...
 
wchar_t ReplacementChar
 When a Unicode character can't be displayed, use this instead. More...
 

Detailed Description

Conversion between different character encodings.

Authors
  • Thomas Roessler

This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with this program. If not, see http://www.gnu.org/licenses/.

Definition in file charset.h.

Macro Definition Documentation

◆ MUTT_ICONV_NO_FLAGS

#define MUTT_ICONV_NO_FLAGS   0

No flags are set.

Definition at line 71 of file charset.h.

◆ MUTT_ICONV_HOOK_FROM

#define MUTT_ICONV_HOOK_FROM   1

apply charset-hooks to fromcode

Definition at line 72 of file charset.h.

◆ mutt_ch_is_utf8

#define mutt_ch_is_utf8 (   str)    mutt_ch_chscmp(str, "utf-8")

Definition at line 95 of file charset.h.

◆ mutt_ch_is_us_ascii

#define mutt_ch_is_us_ascii (   str)    mutt_ch_chscmp(str, "us-ascii")

Definition at line 96 of file charset.h.

Enumeration Type Documentation

◆ LookupType

enum LookupType

Types of character set lookups.

Enumerator
MUTT_LOOKUP_CHARSET 

Alias for another character set.

MUTT_LOOKUP_ICONV 

Character set conversion.

Definition at line 65 of file charset.h.

66{
69};
@ MUTT_LOOKUP_ICONV
Character set conversion.
Definition: charset.h:68
@ MUTT_LOOKUP_CHARSET
Alias for another character set.
Definition: charset.h:67

Function Documentation

◆ mutt_ch_canonical_charset()

void mutt_ch_canonical_charset ( char *  buf,
size_t  buflen,
const char *  name 
)

Canonicalise the charset of a string.

Parameters
buf — Buffer for canonical character set name
buflen — Length of buffer
name — Name to be canonicalised

This first ties off any charset extension such as "//TRANSLIT", canonicalizes the charset, and then re-adds the extension.

Definition at line 350 of file charset.c.

351{
352 if (!buf || !name)
353 return;
354
355 char in[1024], scratch[1024 + 10];
356
357 mutt_str_copy(in, name, sizeof(in));
358 char *ext = strchr(in, '/');
359 if (ext)
360 *ext++ = '\0';
361
362 if (mutt_istr_equal(in, "utf-8") || mutt_istr_equal(in, "utf8"))
363 {
364 mutt_str_copy(buf, "utf-8", buflen);
365 goto out;
366 }
367
368 /* catch some common iso-8859-something misspellings */
369 size_t plen;
370 if ((plen = mutt_istr_startswith(in, "8859")) && (in[plen] != '-'))
371 snprintf(scratch, sizeof(scratch), "iso-8859-%s", in + plen);
372 else if ((plen = mutt_istr_startswith(in, "8859-")))
373 snprintf(scratch, sizeof(scratch), "iso-8859-%s", in + plen);
374 else if ((plen = mutt_istr_startswith(in, "iso8859")) && (in[plen] != '-'))
375 snprintf(scratch, sizeof(scratch), "iso_8859-%s", in + plen);
376 else if ((plen = mutt_istr_startswith(in, "iso8859-")))
377 snprintf(scratch, sizeof(scratch), "iso_8859-%s", in + plen);
378 else
379 mutt_str_copy(scratch, in, sizeof(scratch));
380
381 for (size_t i = 0; PreferredMimeNames[i].key; i++)
382 {
383 if (mutt_istr_equal(scratch, PreferredMimeNames[i].key))
384 {
385 mutt_str_copy(buf, PreferredMimeNames[i].pref, buflen);
386 goto out;
387 }
388 }
389
390 mutt_str_copy(buf, scratch, buflen);
391
392 /* for cosmetics' sake, transform to lowercase. */
393 for (char *p = buf; *p; p++)
394 *p = tolower(*p);
395
396out:
397 if (ext && *ext)
398 {
399 mutt_str_cat(buf, buflen, "/");
400 mutt_str_cat(buf, buflen, ext);
401 }
402}
const struct MimeNames PreferredMimeNames[]
Lookup table of preferred charsets.
Definition: charset.c:97
bool mutt_istr_equal(const char *a, const char *b)
Compare two strings, ignoring case.
Definition: string.c:819
size_t mutt_str_copy(char *dest, const char *src, size_t dsize)
Copy a string into a buffer (guaranteeing NUL-termination)
Definition: string.c:652
size_t mutt_istr_startswith(const char *str, const char *prefix)
Check whether a string starts with a prefix, ignoring case.
Definition: string.c:239
char * mutt_str_cat(char *buf, size_t buflen, const char *s)
Concatenate two strings.
Definition: string.c:265
static size_t plen
Length of cached packet.
Definition: pgppacket.c:39
const char * key
Definition: charset.c:83
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_charset_lookup()

const char * mutt_ch_charset_lookup ( const char *  chs)

Look for a replacement character set.

Parameters
chs — Character set to look up
Return values
ptr — Replacement character set (if a 'charset-hook' matches)
NULL — No matching hook

Look through all the 'charset-hook's. If one matches return the replacement character set.

Definition at line 536 of file charset.c.

537{
539}
static char * chs
Definition: gnupgparse.c:73
static const char * lookup_charset(enum LookupType type, const char *cs)
Look for a preferred character set name.
Definition: charset.c:279
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_check()

int mutt_ch_check ( const char *  s,
size_t  slen,
const char *  from,
const char *  to 
)

Check whether a string can be converted between encodings.

Parameters
[in] s — String to check
[in] slen — Length of the string to check
[in] from — Current character set
[in] to — Target character set
Return values
0 — Success
-1 — Error in iconv_open()
>0 — Errno as set by iconv()

Definition at line 715 of file charset.c.

716{
717 if (!s || !from || !to)
718 return -1;
719
720 int rc = 0;
721 iconv_t cd = mutt_ch_iconv_open(to, from, MUTT_ICONV_NO_FLAGS);
722 if (cd == (iconv_t) -1)
723 return -1;
724
725 size_t outlen = MB_LEN_MAX * slen;
726 char *out = mutt_mem_malloc(outlen + 1);
727 char *saved_out = out;
728
729 const size_t convlen = iconv(cd, (ICONV_CONST char **) &s, &slen, &out, &outlen);
730 if (convlen == (size_t) -1)
731 rc = errno;
732
733 FREE(&saved_out);
734 iconv_close(cd);
735 return rc;
736}
void * mutt_mem_malloc(size_t size)
Allocate memory on the heap.
Definition: memory.c:90
#define FREE(x)
Definition: memory.h:43
iconv_t mutt_ch_iconv_open(const char *tocode, const char *fromcode, uint8_t flags)
Set up iconv for conversions.
Definition: charset.c:563
#define MUTT_ICONV_NO_FLAGS
No flags are set.
Definition: charset.h:71
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_check_charset()

bool mutt_ch_check_charset ( const char *  cs,
bool  strict 
)

Does iconv understand a character set?

Parameters
cs — Character set to check
strict — Check strictly by using iconv
Return values
true — Character set is valid

If strict is false, then finding a matching character set in PreferredMimeNames will be enough. If strict is true, or the charset is not in PreferredMimeNames, then iconv() will be run.

Definition at line 816 of file charset.c.

817{
818 if (!cs)
819 return false;
820
821 if (mutt_ch_is_utf8(cs))
822 return true;
823
824 if (!strict)
825 {
826 for (int i = 0; PreferredMimeNames[i].key; i++)
827 {
828 if (mutt_istr_equal(PreferredMimeNames[i].key, cs) ||
830 {
831 return true;
832 }
833 }
834 }
835
836 iconv_t cd = mutt_ch_iconv_open(cs, cs, MUTT_ICONV_NO_FLAGS);
837 if (cd != (iconv_t) (-1))
838 {
839 iconv_close(cd);
840 return true;
841 }
842
843 return false;
844}
#define mutt_ch_is_utf8(str)
Definition: charset.h:95
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_choose()

char * mutt_ch_choose ( const char *  fromcode,
const struct Slist *  charsets,
const char *  u,
size_t  ulen,
char **  d,
size_t *  dlen 
)

Figure the best charset to encode a string.

Parameters
[in] fromcode — Original charset of the string
[in] charsets — List of potential charsets to use
[in] u — String to encode
[in] ulen — Length of the string to encode
[out] d — If not NULL, point it to the converted string
[out] dlen — If not NULL, point it to the length of the d string
Return values
ptr — Best performing charset
NULL — None could be found

Definition at line 1036 of file charset.c.

1038{
1039 if (!fromcode || !charsets)
1040 return NULL;
1041
1042 char *e = NULL, *tocode = NULL;
1043 size_t elen = 0, bestn = 0;
1044
1045 const struct ListNode *np = NULL;
1046 STAILQ_FOREACH(np, &charsets->head, entries)
1047 {
1048 char *t = mutt_str_dup(np->data);
1049 if (!t)
1050 continue;
1051
1052 size_t n = mutt_str_len(t);
1053 char *s = mutt_strn_dup(u, ulen);
1054 const int rc = d ? mutt_ch_convert_string(&s, fromcode, t, MUTT_ICONV_NO_FLAGS) :
1055 mutt_ch_check(s, ulen, fromcode, t);
1056 if (rc)
1057 {
1058 FREE(&t);
1059 FREE(&s);
1060 continue;
1061 }
1062 size_t slen = mutt_str_len(s);
1063
1064 if (!tocode || (n < bestn))
1065 {
1066 bestn = n;
1067 FREE(&tocode);
1068 tocode = t;
1069 if (d)
1070 {
1071 FREE(&e);
1072 e = s;
1073 }
1074 else
1075 {
1076 FREE(&s);
1077 }
1078 elen = slen;
1079 }
1080 else
1081 {
1082 FREE(&t);
1083 FREE(&s);
1084 }
1085 }
1086 if (tocode)
1087 {
1088 if (d)
1089 *d = e;
1090 if (dlen)
1091 *dlen = elen;
1092
1093 char canonical_buf[1024] = { 0 };
1094 mutt_ch_canonical_charset(canonical_buf, sizeof(canonical_buf), tocode);
1095 mutt_str_replace(&tocode, canonical_buf);
1096 }
1097 return tocode;
1098}
void mutt_ch_canonical_charset(char *buf, size_t buflen, const char *name)
Canonicalise the charset of a string.
Definition: charset.c:350
int mutt_ch_convert_string(char **ps, const char *from, const char *to, uint8_t flags)
Convert a string between encodings.
Definition: charset.c:751
int mutt_ch_check(const char *s, size_t slen, const char *from, const char *to)
Check whether a string can be converted between encodings.
Definition: charset.c:715
char * mutt_strn_dup(const char *begin, size_t len)
Duplicate a sub-string.
Definition: string.c:451
char * mutt_str_dup(const char *str)
Copy a string, safely.
Definition: string.c:250
size_t mutt_str_len(const char *a)
Calculate the length of a string, safely.
Definition: string.c:567
char * mutt_str_replace(char **p, const char *s)
Replace one string with another.
Definition: string.c:326
#define STAILQ_FOREACH(var, head, field)
Definition: queue.h:352
A List node for strings.
Definition: list.h:35
char * data
String.
Definition: list.h:36
struct ListHead head
List containing values.
Definition: slist.h:48
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_chscmp()

bool mutt_ch_chscmp ( const char *  cs1,
const char *  cs2 
)

Are the names of two character sets equivalent?

Parameters
cs1 — First character set
cs2 — Second character set
Return values
true — Names are equivalent
false — Names differ

Charsets may have extensions that mutt_ch_canonical_charset() leaves intact. We expect 'cs2' to originate from neomutt code, not user input (i.e. 'cs2' does not have any extension), so we simply check if the shorter string is a prefix of the longer.

Definition at line 416 of file charset.c.

417{
418 if (!cs1 || !cs2)
419 return false;
420
421 char buf[256] = { 0 };
422
423 mutt_ch_canonical_charset(buf, sizeof(buf), cs1);
424
425 int len1 = mutt_str_len(buf);
426 int len2 = mutt_str_len(cs2);
427
428 return mutt_istrn_equal(((len1 > len2) ? buf : cs2),
429 ((len1 > len2) ? cs2 : buf), MIN(len1, len2));
430}
#define MIN(a, b)
Definition: memory.h:31
bool mutt_istrn_equal(const char *a, const char *b, size_t num)
Check for equality of two strings ignoring case (to a maximum), safely.
Definition: string.c:524
+ Here is the call graph for this function:

◆ mutt_ch_convert_nonmime_string()

int mutt_ch_convert_nonmime_string ( const struct Slist *const  assumed_charset,
const char *  charset,
char **  ps 
)

Try to convert a string using a list of character sets.

Parameters
[in] assumed_charset — From $assumed_charset
[in] charset — From $charset
[in,out] ps — String to be converted
Return values
0 — Success
-1 — Error

Work through $assumed_charset looking for a character set conversion that works. Failing that, try mutt_ch_get_default_charset().

Definition at line 307 of file charset.c.

309{
310 if (!ps)
311 return -1;
312
313 char *u = *ps;
314 const size_t ulen = mutt_str_len(u);
315 if (ulen == 0)
316 return 0;
317
318 const struct ListNode *np = NULL;
319 STAILQ_FOREACH(np, &assumed_charset->head, entries)
320 {
321 char const *c = np->data;
322 size_t n = mutt_str_len(c);
323 char *fromcode = mutt_mem_malloc(n + 1);
324 mutt_str_copy(fromcode, c, n + 1);
325 char *s = mutt_strn_dup(u, ulen);
326 int m = mutt_ch_convert_string(&s, fromcode, charset, MUTT_ICONV_NO_FLAGS);
327 FREE(&fromcode);
328 if (m == 0)
329 {
330 FREE(ps);
331 *ps = s;
332 return 0;
333 }
334 FREE(&s);
335 }
337 charset, MUTT_ICONV_HOOK_FROM);
338 return -1;
339}
const char * mutt_ch_get_default_charset(const struct Slist *const assumed_charset)
Get the default character set.
Definition: charset.c:439
#define MUTT_ICONV_HOOK_FROM
apply charset-hooks to fromcode
Definition: charset.h:72
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_convert_string()

int mutt_ch_convert_string ( char **  ps,
const char *  from,
const char *  to,
uint8_t  flags 
)

Convert a string between encodings.

Parameters
[in,out] ps — String to convert
[in] from — Current character set
[in] to — Target character set
[in] flags — Flags, e.g. MUTT_ICONV_HOOK_FROM
Return values
0 — Success
-1 — Invalid arguments or failure to open an iconv channel
errno — Failure in iconv conversion

Parameter flags is given as-is to mutt_ch_iconv_open(). See there for its meaning and usage policy.

Definition at line 751 of file charset.c.

752{
753 if (!ps)
754 return -1;
755
756 char *s = *ps;
757
758 if (!s || (*s == '\0'))
759 return 0;
760
761 if (!to || !from)
762 return -1;
763
764 const char *repls[] = { "\357\277\275", "?", 0 };
765 int rc = 0;
766
767 iconv_t cd = mutt_ch_iconv_open(to, from, flags);
768 if (cd == (iconv_t) -1)
769 return -1;
770
771 const char **inrepls = NULL;
772 const char *outrepl = NULL;
773
774 if (mutt_ch_is_utf8(to))
775 outrepl = "\357\277\275";
776 else if (mutt_ch_is_utf8(from))
777 inrepls = repls;
778 else
779 outrepl = "?";
780
781 const char *ib = s;
782 size_t ibl = strlen(s);
783 if (ibl >= (SIZE_MAX / MB_LEN_MAX))
784 {
785 iconv_close(cd);
786 return -1;
787 }
788 size_t obl = MB_LEN_MAX * ibl;
789 char *buf = mutt_mem_malloc(obl + 1);
790 char *ob = buf;
791
792 mutt_ch_iconv(cd, &ib, &ibl, &ob, &obl, inrepls, outrepl, &rc);
793 iconv(cd, 0, 0, &ob, &obl);
794 iconv_close(cd);
795
796 *ob = '\0';
797
798 FREE(ps);
799 *ps = buf;
800
801 mutt_str_adjust(ps);
802 return rc;
803}
size_t mutt_ch_iconv(iconv_t cd, const char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft, const char **inrepls, const char *outrepl, int *iconverrno)
Change the encoding of a string.
Definition: charset.c:616
void mutt_str_adjust(char **ptr)
Shrink-to-fit a string.
Definition: string.c:370
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_fgetconv()

int mutt_ch_fgetconv ( struct FgetConv *  fc)

Convert a file's character set.

Parameters
fc — FgetConv handle
Return values
num — Next character in the converted file
EOF — Error or end of file

A file is read into a buffer and its character set is converted. Each call to this function will return one converted character. The buffer is refilled automatically when empty.

Definition at line 908 of file charset.c.

909{
910 if (!fc)
911 return EOF;
912 if (fc->cd == (iconv_t) -1)
913 return fgetc(fc->fp);
914 if (!fc->p)
915 return EOF;
916 if (fc->p < fc->ob)
917 return (unsigned char) *(fc->p)++;
918
919 /* Try to convert some more */
920 fc->p = fc->bufo;
921 fc->ob = fc->bufo;
922 if (fc->ibl)
923 {
924 size_t obl = sizeof(fc->bufo);
925 iconv(fc->cd, (ICONV_CONST char **) &fc->ib, &fc->ibl, &fc->ob, &obl);
926 if (fc->p < fc->ob)
927 return (unsigned char) *(fc->p)++;
928 }
929
930 /* If we trusted iconv a bit more, we would at this point
931 * ask why it had stopped converting ... */
932
933 /* Try to read some more */
934 if ((fc->ibl == sizeof(fc->bufi)) ||
935 (fc->ibl && (fc->ib + fc->ibl < fc->bufi + sizeof(fc->bufi))))
936 {
937 fc->p = 0;
938 return EOF;
939 }
940 if (fc->ibl)
941 memcpy(fc->bufi, fc->ib, fc->ibl);
942 fc->ib = fc->bufi;
943 fc->ibl += fread(fc->ib + fc->ibl, 1, sizeof(fc->bufi) - fc->ibl, fc->fp);
944
945 /* Try harder this time to convert some */
946 if (fc->ibl)
947 {
948 size_t obl = sizeof(fc->bufo);
949 mutt_ch_iconv(fc->cd, (const char **) &fc->ib, &fc->ibl, &fc->ob, &obl,
950 fc->inrepls, 0, NULL);
951 if (fc->p < fc->ob)
952 return (unsigned char) *(fc->p)++;
953 }
954
955 /* Either the file has finished or one of the buffers is too small */
956 fc->p = 0;
957 return EOF;
958}
char bufi[512]
Definition: charset.h:44
iconv_t cd
Definition: charset.h:43
char bufo[512]
Definition: charset.h:45
size_t ibl
Definition: charset.h:49
FILE * fp
Definition: charset.h:42
char * p
Definition: charset.h:46
const char ** inrepls
Definition: charset.h:50
char * ib
Definition: charset.h:48
char * ob
Definition: charset.h:47
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_fgetconv_close()

void mutt_ch_fgetconv_close ( struct FgetConv **  fc)

Close an fgetconv handle.

Parameters
[out] fc — fgetconv handle

Definition at line 888 of file charset.c.

889{
890 if (!fc || !*fc)
891 return;
892
893 if ((*fc)->cd != (iconv_t) -1)
894 iconv_close((*fc)->cd);
895 FREE(fc);
896}
+ Here is the caller graph for this function:

◆ mutt_ch_fgetconv_open()

struct FgetConv * mutt_ch_fgetconv_open ( FILE *  fp,
const char *  from,
const char *  to,
uint8_t  flags 
)

Prepare a file for charset conversion.

Parameters
fp — FILE ptr to prepare
from — Current character set
to — Destination character set
flags — Flags, e.g. MUTT_ICONV_HOOK_FROM
Return values
ptr — fgetconv handle

Parameter flags is given as-is to mutt_ch_iconv_open().

Definition at line 856 of file charset.c.

857{
858 struct FgetConv *fc = NULL;
859 iconv_t cd = (iconv_t) -1;
860
861 if (from && to)
862 cd = mutt_ch_iconv_open(to, from, flags);
863
864 if (cd != (iconv_t) -1)
865 {
866 static const char *repls[] = { "\357\277\275", "?", 0 };
867
868 fc = mutt_mem_malloc(sizeof(struct FgetConv));
869 fc->p = fc->bufo;
870 fc->ob = fc->bufo;
871 fc->ib = fc->bufi;
872 fc->ibl = 0;
873 fc->inrepls = mutt_ch_is_utf8(to) ? repls : repls + 1;
874 }
875 else
876 {
877 fc = mutt_mem_malloc(sizeof(struct FgetConvNot));
878 }
879 fc->fp = fp;
880 fc->cd = cd;
881 return fc;
882}
A dummy converter.
Definition: charset.h:57
Cursor for converting a file's encoding.
Definition: charset.h:41
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_fgetconvs()

char * mutt_ch_fgetconvs ( char *  buf,
size_t  buflen,
struct FgetConv fc 
)

Convert a file's charset into a string buffer.

Parameters
buf — Buffer for result
buflen — Length of buffer
fc — FgetConv handle
Return values
ptr — Success, result buffer
NULL — Error

Read a file into a buffer, converting the character set as it goes.

Definition at line 970 of file charset.c.

971{
972 if (!buf)
973 return NULL;
974
975 size_t r;
976 for (r = 0; (r + 1) < buflen;)
977 {
978 const int c = mutt_ch_fgetconv(fc);
979 if (c == EOF)
980 break;
981 buf[r++] = (char) c;
982 if (c == '\n')
983 break;
984 }
985 buf[r] = '\0';
986
987 if (r > 0)
988 return buf;
989
990 return NULL;
991}
int mutt_ch_fgetconv(struct FgetConv *fc)
Convert a file's character set.
Definition: charset.c:908
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_get_default_charset()

const char * mutt_ch_get_default_charset ( const struct Slist *const  assumed_charset)

Get the default character set.

Parameters
assumed_charset — From $assumed_charset
Return values
ptr — Name of the default character set
Warning
This returns a pointer to a static buffer. Do not free it.

Definition at line 439 of file charset.c.

440{
441 static char fcharset[128];
442 const char *c = NULL;
443
444 if (assumed_charset && (assumed_charset->count > 0))
445 c = STAILQ_FIRST(&assumed_charset->head)->data;
446 else
447 c = "us-ascii";
448
449 mutt_str_copy(fcharset, c, sizeof(fcharset));
450 return fcharset;
451}
#define STAILQ_FIRST(head)
Definition: queue.h:350
size_t count
Number of values in list.
Definition: slist.h:49
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_get_langinfo_charset()

char * mutt_ch_get_langinfo_charset ( void  )

Get the user's choice of character set.

Return values
ptr — Charset string

Get the canonical character set used by the user's locale. The caller must free the returned string.

Definition at line 460 of file charset.c.

461{
462 char buf[1024] = { 0 };
463
464 mutt_ch_canonical_charset(buf, sizeof(buf), nl_langinfo(CODESET));
465
466 if (buf[0] != '\0')
467 return mutt_str_dup(buf);
468
469 return mutt_str_dup("iso-8859-1");
470}
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_iconv()

size_t mutt_ch_iconv ( iconv_t  cd,
const char **  inbuf,
size_t *  inbytesleft,
char **  outbuf,
size_t *  outbytesleft,
const char **  inrepls,
const char *  outrepl,
int *  iconverrno 
)

Change the encoding of a string.

Parameters
[in] cd — Iconv conversion descriptor
[in,out] inbuf — Buffer to convert
[in,out] inbytesleft — Length of buffer to convert
[in,out] outbuf — Buffer for the result
[in,out] outbytesleft — Length of result buffer
[in] inrepls — Input replacement characters
[in] outrepl — Output replacement characters
[out] iconverrno — Errno if iconv() fails, 0 if it succeeds
Return values
num — Characters converted

Like iconv, but keeps going even when the input is invalid. If you're supplying inrepls, the source charset should be stateless; if you're supplying an outrepl, the target charset should be.

Definition at line 616 of file charset.c.

619{
620 size_t rc = 0;
621 const char *ib = *inbuf;
622 size_t ibl = *inbytesleft;
623 char *ob = *outbuf;
624 size_t obl = *outbytesleft;
625
626 while (true)
627 {
628 errno = 0;
629 const size_t ret1 = iconv(cd, (ICONV_CONST char **) &ib, &ibl, &ob, &obl);
630 if (ret1 != (size_t) -1)
631 rc += ret1;
632 if (iconverrno)
633 *iconverrno = errno;
634
635 if (ibl && obl && (errno == EILSEQ))
636 {
637 if (inrepls)
638 {
639 /* Try replacing the input */
640 const char **t = NULL;
641 for (t = inrepls; *t; t++)
642 {
643 const char *ib1 = *t;
644 size_t ibl1 = strlen(*t);
645 char *ob1 = ob;
646 size_t obl1 = obl;
647 iconv(cd, (ICONV_CONST char **) &ib1, &ibl1, &ob1, &obl1);
648 if (ibl1 == 0)
649 {
650 ib++;
651 ibl--;
652 ob = ob1;
653 obl = obl1;
654 rc++;
655 break;
656 }
657 }
658 if (*t)
659 continue;
660 }
661 /* Replace the output */
662 if (!outrepl)
663 outrepl = "?";
664 iconv(cd, NULL, NULL, &ob, &obl);
665 if (obl)
666 {
667 int n = strlen(outrepl);
668 if (n > obl)
669 {
670 outrepl = "?";
671 n = 1;
672 }
673 memcpy(ob, outrepl, n);
674 ib++;
675 ibl--;
676 ob += n;
677 obl -= n;
678 rc++;
679 iconv(cd, NULL, NULL, NULL, NULL); /* for good measure */
680 continue;
681 }
682 }
683 *inbuf = ib;
684 *inbytesleft = ibl;
685 *outbuf = ob;
686 *outbytesleft = obl;
687 return rc;
688 }
689}
#define EILSEQ
Definition: charset.c:49
+ Here is the caller graph for this function:

◆ mutt_ch_iconv_lookup()

const char * mutt_ch_iconv_lookup ( const char *  chs)

Look for a replacement character set.

Parameters
chs — Character set to look up
Return values
ptr — Replacement character set (if an 'iconv-hook' matches)
NULL — No matching hook

Look through all the 'iconv-hook's. If one matches return the replacement character set.

Definition at line 700 of file charset.c.

701{
703}
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_iconv_open()

iconv_t mutt_ch_iconv_open ( const char *  tocode,
const char *  fromcode,
uint8_t  flags 
)

Set up iconv for conversions.

Parameters
tocode — Target character set
fromcode — Current character set
flags — Flags, e.g. MUTT_ICONV_HOOK_FROM
Return values
ptr — iconv handle for the conversion

Like iconv_open, but canonicalises the charsets, applies charset-hooks, recanonicalises, and finally applies iconv-hooks. Parameter flags=0 skips charset-hooks, while MUTT_ICONV_HOOK_FROM applies them to fromcode. Callers should use flags=0 when fromcode can safely be considered true, either some constant, or some value provided by the user; MUTT_ICONV_HOOK_FROM should be used only when fromcode is unsure, taken from a possibly wrong incoming MIME label, or such. Misusing MUTT_ICONV_HOOK_FROM leads to unwanted interactions in some setups.

Note
By design charset-hooks should never be, and are never, applied to tocode.
The top-well-named MUTT_ICONV_HOOK_FROM acts on charset-hooks, not at all on iconv-hooks.

Definition at line 563 of file charset.c.

564{
565 char tocode1[128];
566 char fromcode1[128];
567 const char *tocode2 = NULL, *fromcode2 = NULL;
568 const char *tmp = NULL;
569
570 iconv_t cd;
571
572 /* transform to MIME preferred charset names */
573 mutt_ch_canonical_charset(tocode1, sizeof(tocode1), tocode);
574 mutt_ch_canonical_charset(fromcode1, sizeof(fromcode1), fromcode);
575
576 /* maybe apply charset-hooks and recanonicalise fromcode,
577 * but only when caller asked us to sanitize a potentially wrong
578 * charset name incoming from the wild exterior. */
579 if (flags & MUTT_ICONV_HOOK_FROM)
580 {
581 tmp = mutt_ch_charset_lookup(fromcode1);
582 if (tmp)
583 mutt_ch_canonical_charset(fromcode1, sizeof(fromcode1), tmp);
584 }
585
586 /* always apply iconv-hooks to suit system's iconv tastes */
587 tocode2 = mutt_ch_iconv_lookup(tocode1);
588 tocode2 = tocode2 ? tocode2 : tocode1;
589 fromcode2 = mutt_ch_iconv_lookup(fromcode1);
590 fromcode2 = fromcode2 ? fromcode2 : fromcode1;
591
592 /* call system iconv with names it appreciates */
593 cd = iconv_open(tocode2, fromcode2);
594 if (cd != (iconv_t) -1)
595 return cd;
596
597 return (iconv_t) -1;
598}
const char * mutt_ch_iconv_lookup(const char *chs)
Look for a replacement character set.
Definition: charset.c:700
const char * mutt_ch_charset_lookup(const char *chs)
Look for a replacement character set.
Definition: charset.c:536
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_lookup_add()

bool mutt_ch_lookup_add ( enum LookupType  type,
const char *  pat,
const char *  replace,
struct Buffer *  err 
)

Add a new character set lookup.

Parameters
type — Type of character set, e.g. MUTT_LOOKUP_CHARSET
pat — Pattern to match
replace — Replacement string
err — Buffer for error message
Return values
true — Lookup added to list
false — Regex string was invalid

Add a regex for a character set and a replacement name.

Definition at line 483 of file charset.c.

485{
486 if (!pat || !replace)
487 return false;
488
489 regex_t *rx = mutt_mem_calloc(1, sizeof(regex_t));
490 int rc = REG_COMP(rx, pat, REG_ICASE);
491 if (rc != 0)
492 {
493 regerror(rc, rx, err->data, err->dsize);
494 FREE(&rx);
495 return false;
496 }
497
498 struct Lookup *l = lookup_new();
499 l->type = type;
500 l->replacement = mutt_str_dup(replace);
501 l->regex.pattern = mutt_str_dup(pat);
502 l->regex.regex = rx;
503 l->regex.pat_not = false;
504
505 TAILQ_INSERT_TAIL(&Lookups, l, entries);
506
507 return true;
508}
void * mutt_mem_calloc(size_t nmemb, size_t size)
Allocate zeroed memory on the heap.
Definition: memory.c:50
static struct LookupList Lookups
Definition: charset.c:76
static struct Lookup * lookup_new(void)
Create a new Lookup.
Definition: charset.c:245
#define TAILQ_INSERT_TAIL(head, elm, field)
Definition: queue.h:809
#define REG_COMP(preg, regex, cflags)
Compile a regular expression.
Definition: regex3.h:53
size_t dsize
Length of data.
Definition: buffer.h:37
char * data
Pointer to data.
Definition: buffer.h:35
Regex to String lookup table.
Definition: charset.c:68
char * replacement
Alternative charset to use.
Definition: charset.c:71
enum LookupType type
Lookup type.
Definition: charset.c:69
struct Regex regex
Regular expression.
Definition: charset.c:70
char * pattern
printable version
Definition: regex3.h:90
bool pat_not
do not match
Definition: regex3.h:92
regex_t * regex
compiled expression
Definition: regex3.h:91
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_lookup_remove()

void mutt_ch_lookup_remove ( void  )

Remove all the character set lookups.

Empty the list of replacement character set names.

Definition at line 515 of file charset.c.

516{
517 struct Lookup *l = NULL;
518 struct Lookup *tmp = NULL;
519
520 TAILQ_FOREACH_SAFE(l, &Lookups, entries, tmp)
521 {
522 TAILQ_REMOVE(&Lookups, l, entries);
523 lookup_free(&l);
524 }
525}
static void lookup_free(struct Lookup **ptr)
Free a Lookup.
Definition: charset.c:254
#define TAILQ_FOREACH_SAFE(var, head, field, tvar)
Definition: queue.h:735
#define TAILQ_REMOVE(head, elm, field)
Definition: queue.h:841
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_set_charset()

void mutt_ch_set_charset ( const char *  charset)

Update the records for a new character set.

Parameters
charset — New character set

Check if this character set is utf-8 and pick a suitable replacement character for unprintable characters.

Note
This calls bind_textdomain_codeset() which will affect future message translations.

Definition at line 1003 of file charset.c.

1004{
1005 char buf[256] = { 0 };
1006
1007 mutt_ch_canonical_charset(buf, sizeof(buf), charset);
1008
1009 if (mutt_ch_is_utf8(buf))
1010 {
1011 CharsetIsUtf8 = true;
1012 ReplacementChar = 0xfffd; /* replacement character */
1013 }
1014 else
1015 {
1016 CharsetIsUtf8 = false;
1017 ReplacementChar = '?';
1018 }
1019
1020#if defined(HAVE_BIND_TEXTDOMAIN_CODESET) && defined(ENABLE_NLS)
1021 bind_textdomain_codeset(PACKAGE, buf);
1022#endif
1023}
bool CharsetIsUtf8
Is the user's current character set utf-8?
Definition: charset.c:60
wchar_t ReplacementChar
When a Unicode character can't be displayed, use this instead.
Definition: charset.c:55
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

Variable Documentation

◆ CharsetIsUtf8

bool CharsetIsUtf8
extern

Is the user's current character set utf-8?

Definition at line 60 of file charset.c.

◆ ReplacementChar

wchar_t ReplacementChar
extern

When a Unicode character can't be displayed, use this instead.

Definition at line 55 of file charset.c.