NeoMutt
Teaching an old dog new tricks
DOXYGEN
Loading...
Searching...
No Matches
charset.c
Go to the documentation of this file.
1
29#include "config.h"
30#include <ctype.h>
31#include <errno.h>
32#include <iconv.h>
33#include <langinfo.h>
34#include <limits.h>
35#include <stdbool.h>
36#include <stdio.h>
37#include <string.h>
38#include "charset.h"
39#include "buffer.h"
40#include "list.h"
41#include "logging2.h"
42#include "memory.h"
43#include "queue.h"
44#include "regex3.h"
45#include "slist.h"
46#include "string2.h"
47#ifdef ENABLE_NLS
48#include <libintl.h>
49#endif
50
51#ifndef EILSEQ
52#define EILSEQ EINVAL
53#endif
54
58wchar_t ReplacementChar = '?';
59
63bool CharsetIsUtf8 = false;
64
70struct Lookup
71{
73 struct Regex regex;
75 TAILQ_ENTRY(Lookup) entries;
76};
77TAILQ_HEAD(LookupList, Lookup);
78
80static struct LookupList Lookups = TAILQ_HEAD_INITIALIZER(Lookups);
81
86{
87 char *fromcode1;
88 char *tocode1;
89 iconv_t cd;
90};
91
93#define ICONV_CACHE_SIZE 16
97static int IconvCacheUsed = 0;
98
103{
104 const char *key;
105 const char *pref;
106};
107
118static const struct MimeNames PreferredMimeNames[] = {
119 // clang-format off
120 { "ansi_x3.4-1968", "us-ascii" },
121 { "iso-ir-6", "us-ascii" },
122 { "iso_646.irv:1991", "us-ascii" },
123 { "ascii", "us-ascii" },
124 { "iso646-us", "us-ascii" },
125 { "us", "us-ascii" },
126 { "ibm367", "us-ascii" },
127 { "cp367", "us-ascii" },
128 { "csASCII", "us-ascii" },
129
130 { "csISO2022KR", "iso-2022-kr" },
131 { "csEUCKR", "euc-kr" },
132 { "csISO2022JP", "iso-2022-jp" },
133 { "csISO2022JP2", "iso-2022-jp-2" },
134
135 { "ISO_8859-1:1987", "iso-8859-1" },
136 { "iso-ir-100", "iso-8859-1" },
137 { "iso_8859-1", "iso-8859-1" },
138 { "latin1", "iso-8859-1" },
139 { "l1", "iso-8859-1" },
140 { "IBM819", "iso-8859-1" },
141 { "CP819", "iso-8859-1" },
142 { "csISOLatin1", "iso-8859-1" },
143
144 { "ISO_8859-2:1987", "iso-8859-2" },
145 { "iso-ir-101", "iso-8859-2" },
146 { "iso_8859-2", "iso-8859-2" },
147 { "latin2", "iso-8859-2" },
148 { "l2", "iso-8859-2" },
149 { "csISOLatin2", "iso-8859-2" },
150
151 { "ISO_8859-3:1988", "iso-8859-3" },
152 { "iso-ir-109", "iso-8859-3" },
153 { "ISO_8859-3", "iso-8859-3" },
154 { "latin3", "iso-8859-3" },
155 { "l3", "iso-8859-3" },
156 { "csISOLatin3", "iso-8859-3" },
157
158 { "ISO_8859-4:1988", "iso-8859-4" },
159 { "iso-ir-110", "iso-8859-4" },
160 { "ISO_8859-4", "iso-8859-4" },
161 { "latin4", "iso-8859-4" },
162 { "l4", "iso-8859-4" },
163 { "csISOLatin4", "iso-8859-4" },
164
165 { "ISO_8859-6:1987", "iso-8859-6" },
166 { "iso-ir-127", "iso-8859-6" },
167 { "iso_8859-6", "iso-8859-6" },
168 { "ECMA-114", "iso-8859-6" },
169 { "ASMO-708", "iso-8859-6" },
170 { "arabic", "iso-8859-6" },
171 { "csISOLatinArabic", "iso-8859-6" },
172
173 { "ISO_8859-7:1987", "iso-8859-7" },
174 { "iso-ir-126", "iso-8859-7" },
175 { "ISO_8859-7", "iso-8859-7" },
176 { "ELOT_928", "iso-8859-7" },
177 { "ECMA-118", "iso-8859-7" },
178 { "greek", "iso-8859-7" },
179 { "greek8", "iso-8859-7" },
180 { "csISOLatinGreek", "iso-8859-7" },
181
182 { "ISO_8859-8:1988", "iso-8859-8" },
183 { "iso-ir-138", "iso-8859-8" },
184 { "ISO_8859-8", "iso-8859-8" },
185 { "hebrew", "iso-8859-8" },
186 { "csISOLatinHebrew", "iso-8859-8" },
187
188 { "ISO_8859-5:1988", "iso-8859-5" },
189 { "iso-ir-144", "iso-8859-5" },
190 { "ISO_8859-5", "iso-8859-5" },
191 { "cyrillic", "iso-8859-5" },
192 { "csISOLatinCyrillic", "iso-8859-5" },
193
194 { "ISO_8859-9:1989", "iso-8859-9" },
195 { "iso-ir-148", "iso-8859-9" },
196 { "ISO_8859-9", "iso-8859-9" },
197 { "latin5", "iso-8859-9" }, /* this is not a bug */
198 { "l5", "iso-8859-9" },
199 { "csISOLatin5", "iso-8859-9" },
200
201 { "ISO_8859-10:1992", "iso-8859-10" },
202 { "iso-ir-157", "iso-8859-10" },
203 { "latin6", "iso-8859-10" }, /* this is not a bug */
204 { "l6", "iso-8859-10" },
205 { "csISOLatin6", "iso-8859-10" },
206
207 { "csKOI8r", "koi8-r" },
208
209 { "MS_Kanji", "Shift_JIS" }, /* Note the underscore! */
210 { "csShiftJis", "Shift_JIS" },
211
212 { "Extended_UNIX_Code_Packed_Format_for_Japanese",
213 "euc-jp" },
214 { "csEUCPkdFmtJapanese", "euc-jp" },
215
216 { "csGB2312", "gb2312" },
217 { "csbig5", "big5" },
218
219 /* End of official brain damage.
220 * What follows has been taken from glibc's localedata files. */
221
222 { "iso_8859-13", "iso-8859-13" },
223 { "iso-ir-179", "iso-8859-13" },
224 { "latin7", "iso-8859-13" }, /* this is not a bug */
225 { "l7", "iso-8859-13" },
226
227 { "iso_8859-14", "iso-8859-14" },
228 { "latin8", "iso-8859-14" }, /* this is not a bug */
229 { "l8", "iso-8859-14" },
230
231 { "iso_8859-15", "iso-8859-15" },
232 { "latin9", "iso-8859-15" }, /* this is not a bug */
233
234 /* Suggested by Ionel Mugurel Ciobica <tgakic@sg10.chem.tue.nl> */
235 { "latin0", "iso-8859-15" }, /* this is not a bug */
236
237 { "iso_8859-16", "iso-8859-16" },
238 { "latin10", "iso-8859-16" }, /* this is not a bug */
239
240 { "646", "us-ascii" },
241
242 /* http://www.sun.com/software/white-papers/wp-unicode/ */
243
244 { "eucJP", "euc-jp" },
245 { "PCK", "Shift_JIS" },
246 { "ko_KR-euc", "euc-kr" },
247 { "zh_TW-big5", "big5" },
248
249 /* seems to be common on some systems */
250
251 { "sjis", "Shift_JIS" },
252 { "euc-jp-ms", "eucJP-ms" },
253
254 /* If you happen to encounter system-specific brain-damage with respect to
255 * character set naming, please add it above this comment, and submit a patch
256 * to <neomutt-devel@neomutt.org> */
257
258 { NULL, NULL },
259 // clang-format on
260};
261
266static struct Lookup *lookup_new(void)
267{
268 return mutt_mem_calloc(1, sizeof(struct Lookup));
269}
270
275static void lookup_free(struct Lookup **ptr)
276{
277 if (!ptr || !*ptr)
278 return;
279
280 struct Lookup *l = *ptr;
281 FREE(&l->replacement);
282 FREE(&l->regex.pattern);
283 if (l->regex.regex)
284 regfree(l->regex.regex);
285 FREE(&l->regex.regex);
286 FREE(&l->regex);
287
288 FREE(ptr);
289}
290
300static const char *lookup_charset(enum LookupType type, const char *cs)
301{
302 if (!cs)
303 return NULL;
304
305 struct Lookup *l = NULL;
306
307 TAILQ_FOREACH(l, &Lookups, entries)
308 {
309 if (l->type != type)
310 continue;
311 if (mutt_regex_match(&l->regex, cs))
312 return l->replacement;
313 }
314 return NULL;
315}
316
328int mutt_ch_convert_nonmime_string(const struct Slist *const assumed_charset,
329 const char *charset, char **ps)
330{
331 if (!ps)
332 return -1;
333
334 char *u = *ps;
335 const size_t ulen = mutt_str_len(u);
336 if (ulen == 0)
337 return 0;
338
339 const struct ListNode *np = NULL;
340 STAILQ_FOREACH(np, &assumed_charset->head, entries)
341 {
342 char const *c = np->data;
343 size_t n = mutt_str_len(c);
344 char *fromcode = mutt_mem_malloc(n + 1);
345 mutt_str_copy(fromcode, c, n + 1);
346 char *s = mutt_strn_dup(u, ulen);
347 int m = mutt_ch_convert_string(&s, fromcode, charset, MUTT_ICONV_NO_FLAGS);
348 FREE(&fromcode);
349 if (m == 0)
350 {
351 FREE(ps);
352 *ps = s;
353 return 0;
354 }
355 FREE(&s);
356 }
358 charset, MUTT_ICONV_HOOK_FROM);
359 return -1;
360}
361
371void mutt_ch_canonical_charset(char *buf, size_t buflen, const char *name)
372{
373 if (!buf || !name)
374 return;
375
376 char in[1024], scratch[1024 + 10];
377
378 mutt_str_copy(in, name, sizeof(in));
379 char *ext = strchr(in, '/');
380 if (ext)
381 *ext++ = '\0';
382
383 if (mutt_istr_equal(in, "utf-8") || mutt_istr_equal(in, "utf8"))
384 {
385 mutt_str_copy(buf, "utf-8", buflen);
386 goto out;
387 }
388
389 /* catch some common iso-8859-something misspellings */
390 size_t plen;
391 if ((plen = mutt_istr_startswith(in, "8859")) && (in[plen] != '-'))
392 snprintf(scratch, sizeof(scratch), "iso-8859-%s", in + plen);
393 else if ((plen = mutt_istr_startswith(in, "8859-")))
394 snprintf(scratch, sizeof(scratch), "iso-8859-%s", in + plen);
395 else if ((plen = mutt_istr_startswith(in, "iso8859")) && (in[plen] != '-'))
396 snprintf(scratch, sizeof(scratch), "iso_8859-%s", in + plen);
397 else if ((plen = mutt_istr_startswith(in, "iso8859-")))
398 snprintf(scratch, sizeof(scratch), "iso_8859-%s", in + plen);
399 else
400 mutt_str_copy(scratch, in, sizeof(scratch));
401
402 for (size_t i = 0; PreferredMimeNames[i].key; i++)
403 {
404 if (mutt_istr_equal(scratch, PreferredMimeNames[i].key))
405 {
406 mutt_str_copy(buf, PreferredMimeNames[i].pref, buflen);
407 goto out;
408 }
409 }
410
411 mutt_str_copy(buf, scratch, buflen);
412
413 /* for cosmetics' sake, transform to lowercase. */
414 for (char *p = buf; *p; p++)
415 *p = tolower(*p);
416
417out:
418 if (ext && *ext)
419 {
420 mutt_str_cat(buf, buflen, "/");
421 mutt_str_cat(buf, buflen, ext);
422 }
423}
424
/**
 * mutt_ch_chscmp - Are the names of two character sets equivalent?
 * @param cs1 First character set; canonicalised before comparing
 * @param cs2 Second character set; compared as given
 * @retval true  The names match over the length of the shorter one
 * @retval false Either name is NULL, or the names differ
 *
 * The comparison ignores case and only covers as many characters as the
 * shorter of the two names has.
 */
bool mutt_ch_chscmp(const char *cs1, const char *cs2)
{
  if (!cs1 || !cs2)
    return false;

  char canon[256] = { 0 };
  mutt_ch_canonical_charset(canon, sizeof(canon), cs1);

  const int canon_len = mutt_str_len(canon);
  const int cs2_len = mutt_str_len(cs2);

  /* Longer name first, limited to the length of the shorter one */
  if (canon_len > cs2_len)
    return mutt_istrn_equal(canon, cs2, cs2_len);

  return mutt_istrn_equal(cs2, canon, canon_len);
}
452
460const char *mutt_ch_get_default_charset(const struct Slist *const assumed_charset)
461{
462 static char fcharset[128];
463 const char *c = NULL;
464
465 if (assumed_charset && (assumed_charset->count > 0))
466 c = STAILQ_FIRST(&assumed_charset->head)->data;
467 else
468 c = "us-ascii";
469
470 mutt_str_copy(fcharset, c, sizeof(fcharset));
471 return fcharset;
472}
473
482{
483 char buf[1024] = { 0 };
484
485 mutt_ch_canonical_charset(buf, sizeof(buf), nl_langinfo(CODESET));
486
487 if (buf[0] != '\0')
488 return mutt_str_dup(buf);
489
490 return mutt_str_dup("iso-8859-1");
491}
492
504bool mutt_ch_lookup_add(enum LookupType type, const char *pat,
505 const char *replace, struct Buffer *err)
506{
507 if (!pat || !replace)
508 return false;
509
510 regex_t *rx = mutt_mem_calloc(1, sizeof(regex_t));
511 int rc = REG_COMP(rx, pat, REG_ICASE);
512 if (rc != 0)
513 {
514 regerror(rc, rx, err->data, err->dsize);
515 FREE(&rx);
516 return false;
517 }
518
519 struct Lookup *l = lookup_new();
520 l->type = type;
521 l->replacement = mutt_str_dup(replace);
522 l->regex.pattern = mutt_str_dup(pat);
523 l->regex.regex = rx;
524 l->regex.pat_not = false;
525
526 TAILQ_INSERT_TAIL(&Lookups, l, entries);
527
528 return true;
529}
530
537{
538 struct Lookup *l = NULL;
539 struct Lookup *tmp = NULL;
540
541 TAILQ_FOREACH_SAFE(l, &Lookups, entries, tmp)
542 {
543 TAILQ_REMOVE(&Lookups, l, entries);
544 lookup_free(&l);
545 }
546}
547
557const char *mutt_ch_charset_lookup(const char *chs)
558{
560}
561
589iconv_t mutt_ch_iconv_open(const char *tocode, const char *fromcode, uint8_t flags)
590{
591 char tocode1[128];
592 char fromcode1[128];
593 const char *tocode2 = NULL, *fromcode2 = NULL;
594 const char *tmp = NULL;
595
596 /* transform to MIME preferred charset names */
597 mutt_ch_canonical_charset(tocode1, sizeof(tocode1), tocode);
598 mutt_ch_canonical_charset(fromcode1, sizeof(fromcode1), fromcode);
599
600 /* maybe apply charset-hooks and recanonicalise fromcode,
601 * but only when caller asked us to sanitize a potentially wrong
602 * charset name incoming from the wild exterior. */
603 if (flags & MUTT_ICONV_HOOK_FROM)
604 {
605 tmp = mutt_ch_charset_lookup(fromcode1);
606 if (tmp)
607 mutt_ch_canonical_charset(fromcode1, sizeof(fromcode1), tmp);
608 }
609
610 /* check if we have this pair cached already */
611 for (int i = 0; i < IconvCacheUsed; ++i)
612 {
613 if (strcmp(tocode1, IconvCache[i].tocode1) == 0 &&
614 strcmp(fromcode1, IconvCache[i].fromcode1) == 0)
615 {
616 iconv_t cd = IconvCache[i].cd;
617
618 /* make room for this one at the top */
619 struct IconvCacheEntry top = IconvCache[i];
620 for (int j = i; j-- > 0;)
621 {
622 IconvCache[j + 1] = IconvCache[j];
623 }
624 IconvCache[0] = top;
625
626 if (iconv_t_valid(cd))
627 {
628 /* reset state */
629 iconv(cd, NULL, NULL, NULL, NULL);
630 }
631 return cd;
632 }
633 }
634
635 /* not found in cache */
636 /* always apply iconv-hooks to suit system's iconv tastes */
637 tocode2 = mutt_ch_iconv_lookup(tocode1);
638 tocode2 = tocode2 ? tocode2 : tocode1;
639 fromcode2 = mutt_ch_iconv_lookup(fromcode1);
640 fromcode2 = fromcode2 ? fromcode2 : fromcode1;
641
642 /* call system iconv with names it appreciates */
643 iconv_t cd = iconv_open(tocode2, fromcode2);
644
646 {
647 mutt_debug(LL_DEBUG2, "iconv: dropping %s -> %s from the cache\n",
650 /* get rid of the oldest entry */
654 {
655 iconv_close(IconvCache[IconvCacheUsed - 1].cd);
656 }
658 }
659
660 /* make room for this one at the top */
661 for (int j = IconvCacheUsed; j-- > 0;)
662 {
663 IconvCache[j + 1] = IconvCache[j];
664 }
665
667
668 mutt_debug(LL_DEBUG2, "iconv: adding %s -> %s to the cache\n", fromcode1, tocode1);
669 IconvCache[0].fromcode1 = strdup(fromcode1);
670 IconvCache[0].tocode1 = strdup(tocode1);
671 IconvCache[0].cd = cd;
672
673 return cd;
674}
675
692size_t mutt_ch_iconv(iconv_t cd, const char **inbuf, size_t *inbytesleft,
693 char **outbuf, size_t *outbytesleft, const char **inrepls,
694 const char *outrepl, int *iconverrno)
695{
696 size_t rc = 0;
697 const char *ib = *inbuf;
698 size_t ibl = *inbytesleft;
699 char *ob = *outbuf;
700 size_t obl = *outbytesleft;
701
702 while (true)
703 {
704 errno = 0;
705 const size_t ret1 = iconv(cd, (ICONV_CONST char **) &ib, &ibl, &ob, &obl);
706 if (ret1 != ICONV_ILLEGAL_SEQ)
707 rc += ret1;
708 if (iconverrno)
709 *iconverrno = errno;
710
711 if (ibl && obl && (errno == EILSEQ))
712 {
713 if (inrepls)
714 {
715 /* Try replacing the input */
716 const char **t = NULL;
717 for (t = inrepls; *t; t++)
718 {
719 const char *ib1 = *t;
720 size_t ibl1 = strlen(*t);
721 char *ob1 = ob;
722 size_t obl1 = obl;
723 iconv(cd, (ICONV_CONST char **) &ib1, &ibl1, &ob1, &obl1);
724 if (ibl1 == 0)
725 {
726 ib++;
727 ibl--;
728 ob = ob1;
729 obl = obl1;
730 rc++;
731 break;
732 }
733 }
734 if (*t)
735 continue;
736 }
737 /* Replace the output */
738 if (!outrepl)
739 outrepl = "?";
740 iconv(cd, NULL, NULL, &ob, &obl);
741 if (obl)
742 {
743 int n = strlen(outrepl);
744 if (n > obl)
745 {
746 outrepl = "?";
747 n = 1;
748 }
749 memcpy(ob, outrepl, n);
750 ib++;
751 ibl--;
752 ob += n;
753 obl -= n;
754 rc++;
755 iconv(cd, NULL, NULL, NULL, NULL); /* for good measure */
756 continue;
757 }
758 }
759 *inbuf = ib;
760 *inbytesleft = ibl;
761 *outbuf = ob;
762 *outbytesleft = obl;
763 return rc;
764 }
765}
766
776const char *mutt_ch_iconv_lookup(const char *chs)
777{
779}
780
791int mutt_ch_check(const char *s, size_t slen, const char *from, const char *to)
792{
793 if (!s || !from || !to)
794 return -1;
795
796 int rc = 0;
797 iconv_t cd = mutt_ch_iconv_open(to, from, MUTT_ICONV_NO_FLAGS);
798 if (!iconv_t_valid(cd))
799 return -1;
800
801 size_t outlen = MB_LEN_MAX * slen;
802 char *out = mutt_mem_malloc(outlen + 1);
803 char *saved_out = out;
804
805 const size_t convlen = iconv(cd, (ICONV_CONST char **) &s, &slen, &out, &outlen);
806 if (convlen == ICONV_ILLEGAL_SEQ)
807 rc = errno;
808
809 FREE(&saved_out);
810 return rc;
811}
812
/**
 * mutt_ch_convert_string - Convert a string between encodings
 * @param[in,out] ps    String to convert; replaced by the converted string
 * @param[in]     from  Current character set
 * @param[in]     to    Target character set
 * @param[in]     flags Flags passed to mutt_ch_iconv_open(),
 *                      e.g. #MUTT_ICONV_HOOK_FROM
 * @retval 0     Success, or the string was NULL or empty
 * @retval -1    Invalid arguments, no conversion descriptor available, or
 *               the string is too long
 * @retval errno Error code left by the iconv() call inside mutt_ch_iconv()
 *
 * Characters that cannot be converted are replaced. When converting to UTF-8
 * the output replacement is U+FFFD, when converting from UTF-8 U+FFFD and
 * then "?" are tried as input replacements, and otherwise "?" is used as the
 * output replacement.
 *
 * Once conversion has been attempted, the original string is freed and *ps
 * points to a newly allocated buffer.
 */
int mutt_ch_convert_string(char **ps, const char *from, const char *to, uint8_t flags)
{
  if (!ps)
    return -1;

  char *src = *ps;
  if (!src || (src[0] == '\0'))
    return 0;

  if (!from || !to)
    return -1;

  iconv_t cd = mutt_ch_iconv_open(to, from, flags);
  if (!iconv_t_valid(cd))
    return -1;

  /* U+FFFD encoded as UTF-8, then a plain question mark */
  const char *repls[] = { "\357\277\275", "?", 0 };
  const char **inrepls = NULL;
  const char *outrepl = NULL;

  if (mutt_ch_is_utf8(to))
    outrepl = "\357\277\275";
  else if (mutt_ch_is_utf8(from))
    inrepls = repls;
  else
    outrepl = "?";

  size_t in_left = strlen(src);
  if (in_left >= (SIZE_MAX / MB_LEN_MAX))
  {
    /* The worst-case output size would overflow */
    return -1;
  }

  size_t out_left = MB_LEN_MAX * in_left;
  char *converted = mutt_mem_malloc(out_left + 1);
  char *out = converted;
  const char *in = src;
  int rc = 0;

  mutt_ch_iconv(cd, &in, &in_left, &out, &out_left, inrepls, outrepl, &rc);
  iconv(cd, 0, 0, &out, &out_left);
  *out = '\0';

  FREE(ps);
  *ps = converted;
  mutt_str_adjust(ps);

  return rc;
}
877
889bool mutt_ch_check_charset(const char *cs, bool strict)
890{
891 if (!cs)
892 return false;
893
894 if (mutt_ch_is_utf8(cs))
895 return true;
896
897 if (!strict)
898 {
899 for (int i = 0; PreferredMimeNames[i].key; i++)
900 {
901 if (mutt_istr_equal(PreferredMimeNames[i].key, cs) ||
903 {
904 return true;
905 }
906 }
907 }
908
909 iconv_t cd = mutt_ch_iconv_open(cs, cs, MUTT_ICONV_NO_FLAGS);
910 if (iconv_t_valid(cd))
911 {
912 return true;
913 }
914
915 return false;
916}
917
928struct FgetConv *mutt_ch_fgetconv_open(FILE *fp, const char *from, const char *to, uint8_t flags)
929{
930 struct FgetConv *fc = NULL;
931 iconv_t cd = ICONV_T_INVALID;
932
933 if (from && to)
934 cd = mutt_ch_iconv_open(to, from, flags);
935
936 if (iconv_t_valid(cd))
937 {
938 static const char *repls[] = { "\357\277\275", "?", 0 };
939
940 fc = mutt_mem_malloc(sizeof(struct FgetConv));
941 fc->p = fc->bufo;
942 fc->ob = fc->bufo;
943 fc->ib = fc->bufi;
944 fc->ibl = 0;
945 fc->inrepls = mutt_ch_is_utf8(to) ? repls : repls + 1;
946 }
947 else
948 {
949 fc = mutt_mem_malloc(sizeof(struct FgetConvNot));
950 }
951 fc->fp = fp;
952 fc->cd = cd;
953 return fc;
954}
955
961{
962 if (!ptr || !*ptr)
963 return;
964
965 FREE(ptr);
966}
967
979{
980 if (!fc)
981 return EOF;
982 if (!iconv_t_valid(fc->cd))
983 return fgetc(fc->fp);
984 if (!fc->p)
985 return EOF;
986 if (fc->p < fc->ob)
987 return (unsigned char) *(fc->p)++;
988
989 /* Try to convert some more */
990 fc->p = fc->bufo;
991 fc->ob = fc->bufo;
992 if (fc->ibl)
993 {
994 size_t obl = sizeof(fc->bufo);
995 iconv(fc->cd, (ICONV_CONST char **) &fc->ib, &fc->ibl, &fc->ob, &obl);
996 if (fc->p < fc->ob)
997 return (unsigned char) *(fc->p)++;
998 }
999
1000 /* If we trusted iconv a bit more, we would at this point
1001 * ask why it had stopped converting ... */
1002
1003 /* Try to read some more */
1004 if ((fc->ibl == sizeof(fc->bufi)) ||
1005 (fc->ibl && (fc->ib + fc->ibl < fc->bufi + sizeof(fc->bufi))))
1006 {
1007 fc->p = 0;
1008 return EOF;
1009 }
1010 if (fc->ibl)
1011 memcpy(fc->bufi, fc->ib, fc->ibl);
1012 fc->ib = fc->bufi;
1013 fc->ibl += fread(fc->ib + fc->ibl, 1, sizeof(fc->bufi) - fc->ibl, fc->fp);
1014
1015 /* Try harder this time to convert some */
1016 if (fc->ibl)
1017 {
1018 size_t obl = sizeof(fc->bufo);
1019 mutt_ch_iconv(fc->cd, (const char **) &fc->ib, &fc->ibl, &fc->ob, &obl,
1020 fc->inrepls, 0, NULL);
1021 if (fc->p < fc->ob)
1022 return (unsigned char) *(fc->p)++;
1023 }
1024
1025 /* Either the file has finished or one of the buffers is too small */
1026 fc->p = 0;
1027 return EOF;
1028}
1029
/**
 * mutt_ch_fgetconvs - Convert a file's charset into a string buffer
 * @param buf    Buffer for the result
 * @param buflen Length of the buffer
 * @param fc     FgetConv handle
 * @retval ptr  @a buf, holding at least one character
 * @retval NULL @a buf is NULL, @a buflen is 0, or nothing could be read
 *
 * Characters are read with mutt_ch_fgetconv() until a newline has been
 * stored, EOF is returned, or `buflen - 1` characters have been stored.
 * The result is always NUL-terminated.
 */
char *mutt_ch_fgetconvs(char *buf, size_t buflen, struct FgetConv *fc)
{
  /* A zero-length buffer has no room even for the terminating NUL */
  if (!buf || (buflen == 0))
    return NULL;

  size_t r;
  for (r = 0; (r + 1) < buflen;)
  {
    const int c = mutt_ch_fgetconv(fc);
    if (c == EOF)
      break;
    buf[r++] = (char) c;
    if (c == '\n')
      break;
  }
  buf[r] = '\0';

  if (r > 0)
    return buf;

  return NULL;
}
1062
1073void mutt_ch_set_charset(const char *charset)
1074{
1075 char buf[256] = { 0 };
1076
1077 mutt_ch_canonical_charset(buf, sizeof(buf), charset);
1078
1079 if (mutt_ch_is_utf8(buf))
1080 {
1081 CharsetIsUtf8 = true;
1082 ReplacementChar = 0xfffd; /* replacement character */
1083 }
1084 else
1085 {
1086 CharsetIsUtf8 = false;
1087 ReplacementChar = '?';
1088 }
1089
1090#if defined(HAVE_BIND_TEXTDOMAIN_CODESET) && defined(ENABLE_NLS)
1091 bind_textdomain_codeset(PACKAGE, buf);
1092#endif
1093}
1094
1106char *mutt_ch_choose(const char *fromcode, const struct Slist *charsets,
1107 const char *u, size_t ulen, char **d, size_t *dlen)
1108{
1109 if (!fromcode || !charsets)
1110 return NULL;
1111
1112 char *e = NULL, *tocode = NULL;
1113 size_t elen = 0, bestn = 0;
1114
1115 const struct ListNode *np = NULL;
1116 STAILQ_FOREACH(np, &charsets->head, entries)
1117 {
1118 char *t = mutt_str_dup(np->data);
1119 if (!t)
1120 continue;
1121
1122 size_t n = mutt_str_len(t);
1123 char *s = mutt_strn_dup(u, ulen);
1124 const int rc = d ? mutt_ch_convert_string(&s, fromcode, t, MUTT_ICONV_NO_FLAGS) :
1125 mutt_ch_check(s, ulen, fromcode, t);
1126 if (rc)
1127 {
1128 FREE(&t);
1129 FREE(&s);
1130 continue;
1131 }
1132 size_t slen = mutt_str_len(s);
1133
1134 if (!tocode || (n < bestn))
1135 {
1136 bestn = n;
1137 FREE(&tocode);
1138 tocode = t;
1139 if (d)
1140 {
1141 FREE(&e);
1142 e = s;
1143 }
1144 else
1145 {
1146 FREE(&s);
1147 }
1148 elen = slen;
1149 }
1150 else
1151 {
1152 FREE(&t);
1153 FREE(&s);
1154 }
1155 }
1156 if (tocode)
1157 {
1158 if (d)
1159 *d = e;
1160 if (dlen)
1161 *dlen = elen;
1162
1163 char canonical_buf[1024] = { 0 };
1164 mutt_ch_canonical_charset(canonical_buf, sizeof(canonical_buf), tocode);
1165 mutt_str_replace(&tocode, canonical_buf);
1166 }
1167 return tocode;
1168}
1169
1174{
1175 for (int i = 0; i < IconvCacheUsed; ++i)
1176 {
1177 FREE(&IconvCache[i].fromcode1);
1178 FREE(&IconvCache[i].tocode1);
1179 if (iconv_t_valid(IconvCache[i].cd))
1180 {
1181 iconv_close(IconvCache[i].cd);
1182 }
1183 }
1184 IconvCacheUsed = 0;
1185}
General purpose object for storing and parsing strings.
#define mutt_debug(LEVEL,...)
Definition: logging2.h:89
Singly-linked list type.
Logging Dispatcher.
@ LL_DEBUG2
Log at debug level 2.
Definition: logging2.h:44
void * mutt_mem_calloc(size_t nmemb, size_t size)
Allocate zeroed memory on the heap.
Definition: memory.c:50
void * mutt_mem_malloc(size_t size)
Allocate memory on the heap.
Definition: memory.c:90
Memory management wrappers.
#define FREE(x)
Definition: memory.h:45
#define MIN(a, b)
Definition: memory.h:32
bool mutt_ch_check_charset(const char *cs, bool strict)
Does iconv understand a character set?
Definition: charset.c:889
size_t mutt_ch_iconv(iconv_t cd, const char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft, const char **inrepls, const char *outrepl, int *iconverrno)
Change the encoding of a string.
Definition: charset.c:692
void mutt_ch_lookup_remove(void)
Remove all the character set lookups.
Definition: charset.c:536
static int IconvCacheUsed
Number of iconv descriptors in the cache.
Definition: charset.c:97
char * mutt_ch_choose(const char *fromcode, const struct Slist *charsets, const char *u, size_t ulen, char **d, size_t *dlen)
Figure the best charset to encode a string.
Definition: charset.c:1106
int mutt_ch_convert_nonmime_string(const struct Slist *const assumed_charset, const char *charset, char **ps)
Try to convert a string using a list of character sets.
Definition: charset.c:328
static struct LookupList Lookups
Lookup table of preferred character set names.
Definition: charset.c:80
char * mutt_ch_get_langinfo_charset(void)
Get the user's choice of character set.
Definition: charset.c:481
static const struct MimeNames PreferredMimeNames[]
Lookup table of preferred charsets.
Definition: charset.c:118
bool mutt_ch_lookup_add(enum LookupType type, const char *pat, const char *replace, struct Buffer *err)
Add a new character set lookup.
Definition: charset.c:504
void mutt_ch_canonical_charset(char *buf, size_t buflen, const char *name)
Canonicalise the charset of a string.
Definition: charset.c:371
void mutt_ch_cache_cleanup(void)
Clean up the cached iconv handles and charset strings.
Definition: charset.c:1173
const char * mutt_ch_iconv_lookup(const char *chs)
Look for a replacement character set.
Definition: charset.c:776
int mutt_ch_convert_string(char **ps, const char *from, const char *to, uint8_t flags)
Convert a string between encodings.
Definition: charset.c:826
void mutt_ch_set_charset(const char *charset)
Update the records for a new character set.
Definition: charset.c:1073
bool CharsetIsUtf8
Is the user's current character set utf-8?
Definition: charset.c:63
static const char * lookup_charset(enum LookupType type, const char *cs)
Look for a preferred character set name.
Definition: charset.c:300
int mutt_ch_check(const char *s, size_t slen, const char *from, const char *to)
Check whether a string can be converted between encodings.
Definition: charset.c:791
const char * mutt_ch_charset_lookup(const char *chs)
Look for a replacement character set.
Definition: charset.c:557
static struct Lookup * lookup_new(void)
Create a new Lookup.
Definition: charset.c:266
int mutt_ch_fgetconv(struct FgetConv *fc)
Convert a file's character set.
Definition: charset.c:978
#define ICONV_CACHE_SIZE
Max size of the iconv cache.
Definition: charset.c:93
static void lookup_free(struct Lookup **ptr)
Free a Lookup.
Definition: charset.c:275
wchar_t ReplacementChar
When a Unicode character can't be displayed, use this instead.
Definition: charset.c:58
#define EILSEQ
Definition: charset.c:52
struct FgetConv * mutt_ch_fgetconv_open(FILE *fp, const char *from, const char *to, uint8_t flags)
Prepare a file for charset conversion.
Definition: charset.c:928
static struct IconvCacheEntry IconvCache[ICONV_CACHE_SIZE]
Cache of iconv conversion descriptors.
Definition: charset.c:95
char * mutt_ch_fgetconvs(char *buf, size_t buflen, struct FgetConv *fc)
Convert a file's charset into a string buffer.
Definition: charset.c:1040
bool mutt_ch_chscmp(const char *cs1, const char *cs2)
Are the names of two character sets equivalent?
Definition: charset.c:437
void mutt_ch_fgetconv_close(struct FgetConv **ptr)
Close an fgetconv handle.
Definition: charset.c:960
iconv_t mutt_ch_iconv_open(const char *tocode, const char *fromcode, uint8_t flags)
Set up iconv for conversions.
Definition: charset.c:589
const char * mutt_ch_get_default_charset(const struct Slist *const assumed_charset)
Get the default character set.
Definition: charset.c:460
Conversion between different character encodings.
#define MUTT_ICONV_HOOK_FROM
apply charset-hooks to fromcode
Definition: charset.h:72
#define ICONV_T_INVALID
Error value for iconv functions.
Definition: charset.h:100
#define mutt_ch_is_utf8(str)
Definition: charset.h:96
LookupType
Types of character set lookups.
Definition: charset.h:66
@ MUTT_LOOKUP_ICONV
Character set conversion.
Definition: charset.h:68
@ MUTT_LOOKUP_CHARSET
Alias for another character set.
Definition: charset.h:67
#define MUTT_ICONV_NO_FLAGS
No flags are set.
Definition: charset.h:71
#define ICONV_ILLEGAL_SEQ
Error value for iconv() - Illegal sequence.
Definition: charset.h:103
static bool iconv_t_valid(const iconv_t cd)
Is the conversion descriptor valid?
Definition: charset.h:112
bool mutt_regex_match(const struct Regex *regex, const char *str)
Shorthand to mutt_regex_capture()
Definition: regex.c:636
char * mutt_strn_dup(const char *begin, size_t len)
Duplicate a sub-string.
Definition: string.c:452
bool mutt_istr_equal(const char *a, const char *b)
Compare two strings, ignoring case.
Definition: string.c:810
char * mutt_str_dup(const char *str)
Copy a string, safely.
Definition: string.c:251
void mutt_str_adjust(char **ptr)
Shrink-to-fit a string.
Definition: string.c:371
size_t mutt_str_len(const char *a)
Calculate the length of a string, safely.
Definition: string.c:568
size_t mutt_str_copy(char *dest, const char *src, size_t dsize)
Copy a string into a buffer (guaranteeing NUL-termination)
Definition: string.c:653
size_t mutt_istr_startswith(const char *str, const char *prefix)
Check whether a string starts with a prefix, ignoring case.
Definition: string.c:240
bool mutt_istrn_equal(const char *a, const char *b, size_t num)
Check for equality of two strings ignoring case (to a maximum), safely.
Definition: string.c:525
char * mutt_str_replace(char **p, const char *s)
Replace one string with another.
Definition: string.c:327
char * mutt_str_cat(char *buf, size_t buflen, const char *s)
Concatenate two strings.
Definition: string.c:266
#define TAILQ_FOREACH(var, head, field)
Definition: queue.h:725
#define TAILQ_FOREACH_SAFE(var, head, field, tvar)
Definition: queue.h:735
#define STAILQ_FIRST(head)
Definition: queue.h:350
#define TAILQ_HEAD(name, type)
Definition: queue.h:623
#define TAILQ_INSERT_TAIL(head, elm, field)
Definition: queue.h:809
#define STAILQ_FOREACH(var, head, field)
Definition: queue.h:352
#define TAILQ_REMOVE(head, elm, field)
Definition: queue.h:841
#define TAILQ_HEAD_INITIALIZER(head)
Definition: queue.h:637
#define TAILQ_ENTRY(type)
Definition: queue.h:640
Manage regular expressions.
#define REG_COMP(preg, regex, cflags)
Compile a regular expression.
Definition: regex3.h:53
A separated list of strings.
String manipulation functions.
String manipulation buffer.
Definition: buffer.h:34
size_t dsize
Length of data.
Definition: buffer.h:37
char * data
Pointer to data.
Definition: buffer.h:35
A dummy converter.
Definition: charset.h:57
Cursor for converting a file's encoding.
Definition: charset.h:41
char bufi[512]
Definition: charset.h:44
iconv_t cd
iconv conversion descriptor
Definition: charset.h:43
char bufo[512]
Definition: charset.h:45
size_t ibl
Definition: charset.h:49
FILE * fp
Definition: charset.h:42
char * p
Definition: charset.h:46
const char ** inrepls
Definition: charset.h:50
char * ib
Definition: charset.h:48
char * ob
Definition: charset.h:47
Cached iconv conversion descriptor.
Definition: charset.c:86
char * tocode1
Destination character set.
Definition: charset.c:88
char * fromcode1
Source character set.
Definition: charset.c:87
iconv_t cd
iconv conversion descriptor
Definition: charset.c:89
A List node for strings.
Definition: list.h:35
char * data
String.
Definition: list.h:36
Regex to String lookup table.
Definition: charset.c:71
char * replacement
Alternative charset to use.
Definition: charset.c:74
enum LookupType type
Lookup type.
Definition: charset.c:72
struct Regex regex
Regular expression.
Definition: charset.c:73
MIME name lookup entry.
Definition: charset.c:103
const char * key
Definition: charset.c:104
const char * pref
Definition: charset.c:105
Cached regular expression.
Definition: regex3.h:89
char * pattern
printable version
Definition: regex3.h:90
bool pat_not
do not match
Definition: regex3.h:92
regex_t * regex
compiled expression
Definition: regex3.h:91
String list.
Definition: slist.h:47
struct ListHead head
List containing values.
Definition: slist.h:48
size_t count
Number of values in list.
Definition: slist.h:49