NeoMutt  2024-02-01-35-geee02f
Teaching an old dog new tricks
DOXYGEN
Loading...
Searching...
No Matches
charset.c
Go to the documentation of this file.
1
32#include "config.h"
33#include <ctype.h>
34#include <errno.h>
35#include <iconv.h>
36#include <langinfo.h>
37#include <limits.h>
38#include <stdbool.h>
39#include <stdio.h>
40#include <string.h>
41#include "charset.h"
42#include "buffer.h"
43#include "list.h"
44#include "logging2.h"
45#include "memory.h"
46#include "queue.h"
47#include "regex3.h"
48#include "slist.h"
49#include "string2.h"
50#ifdef ENABLE_NLS
51#include <libintl.h>
52#endif
53
54#ifndef EILSEQ
55#define EILSEQ EINVAL
56#endif
57
/// When a Unicode character can't be displayed, use this instead
61wchar_t ReplacementChar = '?';
62
/// Is the user's current character set utf-8?
66bool CharsetIsUtf8 = false;
67
73struct Lookup
74{
76 struct Regex regex;
78 TAILQ_ENTRY(Lookup) entries;
79};
/// Declares struct LookupList, the head type for a tail queue of Lookup entries
80TAILQ_HEAD(LookupList, Lookup);
81
/// Lookup table of preferred character set names
83static struct LookupList Lookups = TAILQ_HEAD_INITIALIZER(Lookups);
84
89{
90 char *fromcode1;
91 char *tocode1;
92 iconv_t cd;
93};
94
/// Max size of the iconv cache
96#define ICONV_CACHE_SIZE 16
/// Number of iconv descriptors in the cache
100static int IconvCacheUsed = 0;
101
106{
107 const char *key;
108 const char *pref;
109};
110
/**
 * PreferredMimeNames - Lookup table of preferred charsets
 *
 * Each entry maps an alias (`key`) to the name that should be used in its
 * place (`pref`).  mutt_ch_canonical_charset() walks the table in order,
 * compares the candidate against `key` ignoring case, and copies out `pref`
 * for the first match.
 *
 * The table is terminated by a `{ NULL, NULL }` entry; code iterating over it
 * stops at the first NULL `key`.
 */
121static const struct MimeNames PreferredMimeNames[] = {
122 // clang-format off
123 { "ansi_x3.4-1968", "us-ascii" },
124 { "iso-ir-6", "us-ascii" },
125 { "iso_646.irv:1991", "us-ascii" },
126 { "ascii", "us-ascii" },
127 { "iso646-us", "us-ascii" },
128 { "us", "us-ascii" },
129 { "ibm367", "us-ascii" },
130 { "cp367", "us-ascii" },
131 { "csASCII", "us-ascii" },
132
133 { "csISO2022KR", "iso-2022-kr" },
134 { "csEUCKR", "euc-kr" },
135 { "csISO2022JP", "iso-2022-jp" },
136 { "csISO2022JP2", "iso-2022-jp-2" },
137
138 { "ISO_8859-1:1987", "iso-8859-1" },
139 { "iso-ir-100", "iso-8859-1" },
140 { "iso_8859-1", "iso-8859-1" },
141 { "latin1", "iso-8859-1" },
142 { "l1", "iso-8859-1" },
143 { "IBM819", "iso-8859-1" },
144 { "CP819", "iso-8859-1" },
145 { "csISOLatin1", "iso-8859-1" },
146
147 { "ISO_8859-2:1987", "iso-8859-2" },
148 { "iso-ir-101", "iso-8859-2" },
149 { "iso_8859-2", "iso-8859-2" },
150 { "latin2", "iso-8859-2" },
151 { "l2", "iso-8859-2" },
152 { "csISOLatin2", "iso-8859-2" },
153
154 { "ISO_8859-3:1988", "iso-8859-3" },
155 { "iso-ir-109", "iso-8859-3" },
156 { "ISO_8859-3", "iso-8859-3" },
157 { "latin3", "iso-8859-3" },
158 { "l3", "iso-8859-3" },
159 { "csISOLatin3", "iso-8859-3" },
160
161 { "ISO_8859-4:1988", "iso-8859-4" },
162 { "iso-ir-110", "iso-8859-4" },
163 { "ISO_8859-4", "iso-8859-4" },
164 { "latin4", "iso-8859-4" },
165 { "l4", "iso-8859-4" },
166 { "csISOLatin4", "iso-8859-4" },
167
168 { "ISO_8859-6:1987", "iso-8859-6" },
169 { "iso-ir-127", "iso-8859-6" },
170 { "iso_8859-6", "iso-8859-6" },
171 { "ECMA-114", "iso-8859-6" },
172 { "ASMO-708", "iso-8859-6" },
173 { "arabic", "iso-8859-6" },
174 { "csISOLatinArabic", "iso-8859-6" },
175
176 { "ISO_8859-7:1987", "iso-8859-7" },
177 { "iso-ir-126", "iso-8859-7" },
178 { "ISO_8859-7", "iso-8859-7" },
179 { "ELOT_928", "iso-8859-7" },
180 { "ECMA-118", "iso-8859-7" },
181 { "greek", "iso-8859-7" },
182 { "greek8", "iso-8859-7" },
183 { "csISOLatinGreek", "iso-8859-7" },
184
185 { "ISO_8859-8:1988", "iso-8859-8" },
186 { "iso-ir-138", "iso-8859-8" },
187 { "ISO_8859-8", "iso-8859-8" },
188 { "hebrew", "iso-8859-8" },
189 { "csISOLatinHebrew", "iso-8859-8" },
190
191 { "ISO_8859-5:1988", "iso-8859-5" },
192 { "iso-ir-144", "iso-8859-5" },
193 { "ISO_8859-5", "iso-8859-5" },
194 { "cyrillic", "iso-8859-5" },
195 { "csISOLatinCyrillic", "iso-8859-5" },
196
197 { "ISO_8859-9:1989", "iso-8859-9" },
198 { "iso-ir-148", "iso-8859-9" },
199 { "ISO_8859-9", "iso-8859-9" },
200 { "latin5", "iso-8859-9" }, /* this is not a bug */
201 { "l5", "iso-8859-9" },
202 { "csISOLatin5", "iso-8859-9" },
203
204 { "ISO_8859-10:1992", "iso-8859-10" },
205 { "iso-ir-157", "iso-8859-10" },
206 { "latin6", "iso-8859-10" }, /* this is not a bug */
207 { "l6", "iso-8859-10" },
208 { "csISOLatin6", "iso-8859-10" },
209
210 { "csKOI8r", "koi8-r" },
211
212 { "MS_Kanji", "Shift_JIS" }, /* Note the underscore! */
213 { "csShiftJis", "Shift_JIS" },
214
215 { "Extended_UNIX_Code_Packed_Format_for_Japanese",
216 "euc-jp" },
217 { "csEUCPkdFmtJapanese", "euc-jp" },
218
219 { "csGB2312", "gb2312" },
220 { "csbig5", "big5" },
221
222 /* End of official brain damage.
223 * What follows has been taken from glibc's localedata files. */
224
225 { "iso_8859-13", "iso-8859-13" },
226 { "iso-ir-179", "iso-8859-13" },
227 { "latin7", "iso-8859-13" }, /* this is not a bug */
228 { "l7", "iso-8859-13" },
229
230 { "iso_8859-14", "iso-8859-14" },
231 { "latin8", "iso-8859-14" }, /* this is not a bug */
232 { "l8", "iso-8859-14" },
233
234 { "iso_8859-15", "iso-8859-15" },
235 { "latin9", "iso-8859-15" }, /* this is not a bug */
236
237 /* Suggested by Ionel Mugurel Ciobica <tgakic@sg10.chem.tue.nl> */
238 { "latin0", "iso-8859-15" }, /* this is not a bug */
239
240 { "iso_8859-16", "iso-8859-16" },
241 { "latin10", "iso-8859-16" }, /* this is not a bug */
242
243 { "646", "us-ascii" },
244
245 /* http://www.sun.com/software/white-papers/wp-unicode/ */
246
247 { "eucJP", "euc-jp" },
248 { "PCK", "Shift_JIS" },
249 { "ko_KR-euc", "euc-kr" },
250 { "zh_TW-big5", "big5" },
251
252 /* seems to be common on some systems */
253
254 { "sjis", "Shift_JIS" },
255 { "euc-jp-ms", "eucJP-ms" },
256
257 /* If you happen to encounter system-specific brain-damage with respect to
258 * character set naming, please add it above this comment, and submit a patch
259 * to <neomutt-devel@neomutt.org> */
260
261 { NULL, NULL },
262 // clang-format on
263};
264
269static struct Lookup *lookup_new(void)
270{
271 return mutt_mem_calloc(1, sizeof(struct Lookup));
272}
273
278static void lookup_free(struct Lookup **ptr)
279{
280 if (!ptr || !*ptr)
281 return;
282
283 struct Lookup *l = *ptr;
284 FREE(&l->replacement);
285 FREE(&l->regex.pattern);
286 if (l->regex.regex)
287 regfree(l->regex.regex);
288 FREE(&l->regex.regex);
289 FREE(&l->regex);
290
291 FREE(ptr);
292}
293
303static const char *lookup_charset(enum LookupType type, const char *cs)
304{
305 if (!cs)
306 return NULL;
307
308 struct Lookup *l = NULL;
309
310 TAILQ_FOREACH(l, &Lookups, entries)
311 {
312 if (l->type != type)
313 continue;
314 if (mutt_regex_match(&l->regex, cs))
315 return l->replacement;
316 }
317 return NULL;
318}
319
331int mutt_ch_convert_nonmime_string(const struct Slist *const assumed_charset,
332 const char *charset, char **ps)
333{
334 if (!ps)
335 return -1;
336
337 char *u = *ps;
338 const size_t ulen = mutt_str_len(u);
339 if (ulen == 0)
340 return 0;
341
342 const struct ListNode *np = NULL;
343 STAILQ_FOREACH(np, &assumed_charset->head, entries)
344 {
345 char const *c = np->data;
346 size_t n = mutt_str_len(c);
347 char *fromcode = mutt_mem_malloc(n + 1);
348 mutt_str_copy(fromcode, c, n + 1);
349 char *s = mutt_strn_dup(u, ulen);
350 int m = mutt_ch_convert_string(&s, fromcode, charset, MUTT_ICONV_NO_FLAGS);
351 FREE(&fromcode);
352 if (m == 0)
353 {
354 FREE(ps);
355 *ps = s;
356 return 0;
357 }
358 FREE(&s);
359 }
361 charset, MUTT_ICONV_HOOK_FROM);
362 return -1;
363}
364
374void mutt_ch_canonical_charset(char *buf, size_t buflen, const char *name)
375{
376 if (!buf || !name)
377 return;
378
379 char in[1024] = { 0 };
380 char scratch[1024 + 10] = { 0 };
381
382 mutt_str_copy(in, name, sizeof(in));
383 char *ext = strchr(in, '/');
384 if (ext)
385 *ext++ = '\0';
386
387 if (mutt_istr_equal(in, "utf-8") || mutt_istr_equal(in, "utf8"))
388 {
389 mutt_str_copy(buf, "utf-8", buflen);
390 goto out;
391 }
392
393 /* catch some common iso-8859-something misspellings */
394 size_t plen;
395 if ((plen = mutt_istr_startswith(in, "8859")) && (in[plen] != '-'))
396 snprintf(scratch, sizeof(scratch), "iso-8859-%s", in + plen);
397 else if ((plen = mutt_istr_startswith(in, "8859-")))
398 snprintf(scratch, sizeof(scratch), "iso-8859-%s", in + plen);
399 else if ((plen = mutt_istr_startswith(in, "iso8859")) && (in[plen] != '-'))
400 snprintf(scratch, sizeof(scratch), "iso_8859-%s", in + plen);
401 else if ((plen = mutt_istr_startswith(in, "iso8859-")))
402 snprintf(scratch, sizeof(scratch), "iso_8859-%s", in + plen);
403 else
404 mutt_str_copy(scratch, in, sizeof(scratch));
405
406 for (size_t i = 0; PreferredMimeNames[i].key; i++)
407 {
408 if (mutt_istr_equal(scratch, PreferredMimeNames[i].key))
409 {
410 mutt_str_copy(buf, PreferredMimeNames[i].pref, buflen);
411 goto out;
412 }
413 }
414
415 mutt_str_copy(buf, scratch, buflen);
416
417 /* for cosmetics' sake, transform to lowercase. */
418 for (char *p = buf; *p; p++)
419 *p = tolower(*p);
420
421out:
422 if (ext && *ext)
423 {
424 mutt_str_cat(buf, buflen, "/");
425 mutt_str_cat(buf, buflen, ext);
426 }
427}
428
/**
 * mutt_ch_chscmp - Are the names of two character sets equivalent?
 * @param cs1 First character set; canonicalised before comparing
 * @param cs2 Second character set; compared as given
 * @retval true  The shorter of the two names is a case-insensitive prefix
 *               of the longer one
 * @retval false Otherwise, or if either argument is NULL
 */
bool mutt_ch_chscmp(const char *cs1, const char *cs2)
{
  if (!cs1 || !cs2)
    return false;

  char canon[256] = { 0 };
  mutt_ch_canonical_charset(canon, sizeof(canon), cs1);

  const int canon_len = mutt_str_len(canon);
  const int other_len = mutt_str_len(cs2);

  /* Compare only as many characters as the shorter name has */
  if (canon_len > other_len)
    return mutt_istrn_equal(canon, cs2, other_len);

  return mutt_istrn_equal(cs2, canon, canon_len);
}
456
464const char *mutt_ch_get_default_charset(const struct Slist *const assumed_charset)
465{
466 static char fcharset[128];
467 const char *c = NULL;
468
469 if (assumed_charset && (assumed_charset->count > 0))
470 c = STAILQ_FIRST(&assumed_charset->head)->data;
471 else
472 c = "us-ascii";
473
474 mutt_str_copy(fcharset, c, sizeof(fcharset));
475 return fcharset;
476}
477
486{
487 char buf[1024] = { 0 };
488
489 mutt_ch_canonical_charset(buf, sizeof(buf), nl_langinfo(CODESET));
490
491 if (buf[0] != '\0')
492 return mutt_str_dup(buf);
493
494 return mutt_str_dup("iso-8859-1");
495}
496
508bool mutt_ch_lookup_add(enum LookupType type, const char *pat,
509 const char *replace, struct Buffer *err)
510{
511 if (!pat || !replace)
512 return false;
513
514 regex_t *rx = mutt_mem_calloc(1, sizeof(regex_t));
515 int rc = REG_COMP(rx, pat, REG_ICASE);
516 if (rc != 0)
517 {
518 regerror(rc, rx, err->data, err->dsize);
519 FREE(&rx);
520 return false;
521 }
522
523 struct Lookup *l = lookup_new();
524 l->type = type;
525 l->replacement = mutt_str_dup(replace);
526 l->regex.pattern = mutt_str_dup(pat);
527 l->regex.regex = rx;
528 l->regex.pat_not = false;
529
530 TAILQ_INSERT_TAIL(&Lookups, l, entries);
531
532 return true;
533}
534
541{
542 struct Lookup *l = NULL;
543 struct Lookup *tmp = NULL;
544
545 TAILQ_FOREACH_SAFE(l, &Lookups, entries, tmp)
546 {
547 TAILQ_REMOVE(&Lookups, l, entries);
548 lookup_free(&l);
549 }
550}
551
561const char *mutt_ch_charset_lookup(const char *chs)
562{
564}
565
593iconv_t mutt_ch_iconv_open(const char *tocode, const char *fromcode, uint8_t flags)
594{
595 char tocode1[128] = { 0 };
596 char fromcode1[128] = { 0 };
597 const char *tocode2 = NULL, *fromcode2 = NULL;
598 const char *tmp = NULL;
599
600 /* transform to MIME preferred charset names */
601 mutt_ch_canonical_charset(tocode1, sizeof(tocode1), tocode);
602 mutt_ch_canonical_charset(fromcode1, sizeof(fromcode1), fromcode);
603
604 /* maybe apply charset-hooks and recanonicalise fromcode,
605 * but only when caller asked us to sanitize a potentially wrong
606 * charset name incoming from the wild exterior. */
607 if (flags & MUTT_ICONV_HOOK_FROM)
608 {
609 tmp = mutt_ch_charset_lookup(fromcode1);
610 if (tmp)
611 mutt_ch_canonical_charset(fromcode1, sizeof(fromcode1), tmp);
612 }
613
614 /* check if we have this pair cached already */
615 for (int i = 0; i < IconvCacheUsed; ++i)
616 {
617 if (strcmp(tocode1, IconvCache[i].tocode1) == 0 &&
618 strcmp(fromcode1, IconvCache[i].fromcode1) == 0)
619 {
620 iconv_t cd = IconvCache[i].cd;
621
622 /* make room for this one at the top */
623 struct IconvCacheEntry top = IconvCache[i];
624 for (int j = i; j-- > 0;)
625 {
626 IconvCache[j + 1] = IconvCache[j];
627 }
628 IconvCache[0] = top;
629
630 if (iconv_t_valid(cd))
631 {
632 /* reset state */
633 iconv(cd, NULL, NULL, NULL, NULL);
634 }
635 return cd;
636 }
637 }
638
639 /* not found in cache */
640 /* always apply iconv-hooks to suit system's iconv tastes */
641 tocode2 = mutt_ch_iconv_lookup(tocode1);
642 tocode2 = tocode2 ? tocode2 : tocode1;
643 fromcode2 = mutt_ch_iconv_lookup(fromcode1);
644 fromcode2 = fromcode2 ? fromcode2 : fromcode1;
645
646 /* call system iconv with names it appreciates */
647 iconv_t cd = iconv_open(tocode2, fromcode2);
648
650 {
651 mutt_debug(LL_DEBUG2, "iconv: dropping %s -> %s from the cache\n",
654 /* get rid of the oldest entry */
658 {
659 iconv_close(IconvCache[IconvCacheUsed - 1].cd);
660 }
662 }
663
664 /* make room for this one at the top */
665 for (int j = IconvCacheUsed; j-- > 0;)
666 {
667 IconvCache[j + 1] = IconvCache[j];
668 }
669
671
672 mutt_debug(LL_DEBUG2, "iconv: adding %s -> %s to the cache\n", fromcode1, tocode1);
673 IconvCache[0].fromcode1 = strdup(fromcode1);
674 IconvCache[0].tocode1 = strdup(tocode1);
675 IconvCache[0].cd = cd;
676
677 return cd;
678}
679
/**
 * mutt_ch_iconv - Change the encoding of a string
 * @param[in]     cd           iconv conversion descriptor
 * @param[in,out] inbuf        Input position; updated on return
 * @param[in,out] inbytesleft  Number of input bytes remaining; updated on return
 * @param[in,out] outbuf       Output position; updated on return
 * @param[in,out] outbytesleft Output space remaining; updated on return
 * @param[in]     inrepls      NULL-terminated list of replacement inputs to try, may be NULL
 * @param[in]     outrepl      Replacement to copy into the output; "?" is used if NULL
 * @param[out]    iconverrno   If non-NULL, receives errno as seen after the
 *                             most recent conversion attempt of the main loop
 * @retval num Sum of the successful iconv() return values, plus one for every
 *             input byte that had to be replaced
 *
 * Wraps iconv() in a retry loop.  Whenever iconv() stops with EILSEQ while
 * both input and output space remain, one input byte is skipped and a
 * substitute is emitted instead:
 * - first each string of `inrepls` is tried as replacement *input*; the first
 *   one that iconv() consumes completely wins
 * - failing that, `outrepl` is copied verbatim into the *output*
 *
 * The loop ends on any other outcome (success, a different error, or no
 * space left), at which point the caller's pointers and counters are updated.
 */
696size_t mutt_ch_iconv(iconv_t cd, const char **inbuf, size_t *inbytesleft,
697 char **outbuf, size_t *outbytesleft, const char **inrepls,
698 const char *outrepl, int *iconverrno)
699{
700 size_t rc = 0;
701 const char *ib = *inbuf;
702 size_t ibl = *inbytesleft;
703 char *ob = *outbuf;
704 size_t obl = *outbytesleft;
705
706 while (true)
707 {
/* Clear errno so the EILSEQ test below reflects this call only */
708 errno = 0;
709 const size_t ret1 = iconv(cd, (ICONV_CONST char **) &ib, &ibl, &ob, &obl);
710 if (ret1 != ICONV_ILLEGAL_SEQ)
711 rc += ret1;
712 if (iconverrno)
713 *iconverrno = errno;
714
715 if (ibl && obl && (errno == EILSEQ))
716 {
717 if (inrepls)
718 {
719 /* Try replacing the input */
720 const char **t = NULL;
721 for (t = inrepls; *t; t++)
722 {
/* Convert into scratch copies of the output cursor; they are only
 * committed if the whole replacement was consumed */
723 const char *ib1 = *t;
724 size_t ibl1 = strlen(*t);
725 char *ob1 = ob;
726 size_t obl1 = obl;
727 iconv(cd, (ICONV_CONST char **) &ib1, &ibl1, &ob1, &obl1);
728 if (ibl1 == 0)
729 {
/* Skip one byte of the offending input and keep the converted replacement */
730 ib++;
731 ibl--;
732 ob = ob1;
733 obl = obl1;
734 rc++;
735 break;
736 }
737 }
/* *t is non-NULL only if the loop above left via break, i.e. a replacement worked */
738 if (*t)
739 continue;
740 }
741 /* Replace the output */
742 if (!outrepl)
743 outrepl = "?";
/* NOTE(review): presumably flushes any pending shift sequence into the
 * output before raw bytes are appended - confirm against iconv(3) */
744 iconv(cd, NULL, NULL, &ob, &obl);
745 if (obl)
746 {
/* NOTE(review): n is an int compared against the size_t obl */
747 int n = strlen(outrepl);
748 if (n > obl)
749 {
/* Replacement doesn't fit: fall back to a single "?" (obl is at least 1 here) */
750 outrepl = "?";
751 n = 1;
752 }
753 memcpy(ob, outrepl, n);
754 ib++;
755 ibl--;
756 ob += n;
757 obl -= n;
758 rc++;
759 iconv(cd, NULL, NULL, NULL, NULL); /* for good measure */
760 continue;
761 }
762 }
/* Nothing (more) to substitute: report progress back to the caller */
763 *inbuf = ib;
764 *inbytesleft = ibl;
765 *outbuf = ob;
766 *outbytesleft = obl;
767 return rc;
768 }
769}
770
780const char *mutt_ch_iconv_lookup(const char *chs)
781{
783}
784
795int mutt_ch_check(const char *s, size_t slen, const char *from, const char *to)
796{
797 if (!s || !from || !to)
798 return -1;
799
800 int rc = 0;
801 iconv_t cd = mutt_ch_iconv_open(to, from, MUTT_ICONV_NO_FLAGS);
802 if (!iconv_t_valid(cd))
803 return -1;
804
805 size_t outlen = MB_LEN_MAX * slen;
806 char *out = mutt_mem_malloc(outlen + 1);
807 char *saved_out = out;
808
809 const size_t convlen = iconv(cd, (ICONV_CONST char **) &s, &slen, &out, &outlen);
810 if (convlen == ICONV_ILLEGAL_SEQ)
811 rc = errno;
812
813 FREE(&saved_out);
814 return rc;
815}
816
/**
 * mutt_ch_convert_string - Convert a string between encodings
 * @param[in,out] ps    String to convert; replaced by the converted string
 * @param[in]     from  Current character set
 * @param[in]     to    Target character set
 * @param[in]     flags Flags passed on to mutt_ch_iconv_open()
 * @retval 0   `*ps` is NULL or empty (nothing to do), or the conversion
 *             finished without an error code
 * @retval -1  `ps`, `from` or `to` is NULL, no conversion descriptor could be
 *             opened, or the string is too long
 * @retval num errno value reported by mutt_ch_iconv()
 *
 * Unconvertible input is substituted: with U+FFFD (as UTF-8 bytes) when the
 * target is UTF-8, via replacement input when the source is UTF-8, and with
 * "?" otherwise.
 *
 * Once the conversion has run, the old string is freed and `*ps` points to
 * the new one, whatever value is returned.
 */
int mutt_ch_convert_string(char **ps, const char *from, const char *to, uint8_t flags)
{
  if (!ps)
    return -1;

  char *src = *ps;
  if (!src || (src[0] == '\0'))
    return 0;

  if (!from || !to)
    return -1;

  iconv_t cd = mutt_ch_iconv_open(to, from, flags);
  if (!iconv_t_valid(cd))
    return -1;

  /* Pick the substitution strategy for bytes that can't be converted */
  const char *in_subst[] = { "\357\277\275", "?", 0 };
  const char **inrepls = NULL;
  const char *outrepl = "?";
  if (mutt_ch_is_utf8(to))
  {
    outrepl = "\357\277\275";
  }
  else if (mutt_ch_is_utf8(from))
  {
    inrepls = in_subst;
    outrepl = NULL;
  }

  size_t in_left = strlen(src);
  if (in_left >= (SIZE_MAX / MB_LEN_MAX))
    return -1;

  /* Worst case: every input byte expands to MB_LEN_MAX output bytes */
  size_t out_left = MB_LEN_MAX * in_left;
  char *converted = mutt_mem_malloc(out_left + 1);
  const char *in_pos = src;
  char *out_pos = converted;
  int conv_errno = 0;

  mutt_ch_iconv(cd, &in_pos, &in_left, &out_pos, &out_left, inrepls, outrepl, &conv_errno);
  iconv(cd, 0, 0, &out_pos, &out_left);
  *out_pos = '\0';

  FREE(ps);
  *ps = converted;
  mutt_str_adjust(ps);

  return conv_errno;
}
881
893bool mutt_ch_check_charset(const char *cs, bool strict)
894{
895 if (!cs)
896 return false;
897
898 if (mutt_ch_is_utf8(cs))
899 return true;
900
901 if (!strict)
902 {
903 for (int i = 0; PreferredMimeNames[i].key; i++)
904 {
905 if (mutt_istr_equal(PreferredMimeNames[i].key, cs) ||
907 {
908 return true;
909 }
910 }
911 }
912
913 iconv_t cd = mutt_ch_iconv_open(cs, cs, MUTT_ICONV_NO_FLAGS);
914 if (iconv_t_valid(cd))
915 {
916 return true;
917 }
918
919 return false;
920}
921
932struct FgetConv *mutt_ch_fgetconv_open(FILE *fp, const char *from, const char *to, uint8_t flags)
933{
934 struct FgetConv *fc = NULL;
935 iconv_t cd = ICONV_T_INVALID;
936
937 if (from && to)
938 cd = mutt_ch_iconv_open(to, from, flags);
939
940 if (iconv_t_valid(cd))
941 {
942 static const char *repls[] = { "\357\277\275", "?", 0 };
943
944 fc = mutt_mem_malloc(sizeof(struct FgetConv));
945 fc->p = fc->bufo;
946 fc->ob = fc->bufo;
947 fc->ib = fc->bufi;
948 fc->ibl = 0;
949 fc->inrepls = mutt_ch_is_utf8(to) ? repls : repls + 1;
950 }
951 else
952 {
953 fc = mutt_mem_malloc(sizeof(struct FgetConvNot));
954 }
955 fc->fp = fp;
956 fc->cd = cd;
957 return fc;
958}
959
965{
966 if (!ptr || !*ptr)
967 return;
968
969 FREE(ptr);
970}
971
983{
984 if (!fc)
985 return EOF;
986 if (!iconv_t_valid(fc->cd))
987 return fgetc(fc->fp);
988 if (!fc->p)
989 return EOF;
990 if (fc->p < fc->ob)
991 return (unsigned char) *(fc->p)++;
992
993 /* Try to convert some more */
994 fc->p = fc->bufo;
995 fc->ob = fc->bufo;
996 if (fc->ibl)
997 {
998 size_t obl = sizeof(fc->bufo);
999 iconv(fc->cd, (ICONV_CONST char **) &fc->ib, &fc->ibl, &fc->ob, &obl);
1000 if (fc->p < fc->ob)
1001 return (unsigned char) *(fc->p)++;
1002 }
1003
1004 /* If we trusted iconv a bit more, we would at this point
1005 * ask why it had stopped converting ... */
1006
1007 /* Try to read some more */
1008 if ((fc->ibl == sizeof(fc->bufi)) ||
1009 (fc->ibl && (fc->ib + fc->ibl < fc->bufi + sizeof(fc->bufi))))
1010 {
1011 fc->p = 0;
1012 return EOF;
1013 }
1014 if (fc->ibl)
1015 memcpy(fc->bufi, fc->ib, fc->ibl);
1016 fc->ib = fc->bufi;
1017 fc->ibl += fread(fc->ib + fc->ibl, 1, sizeof(fc->bufi) - fc->ibl, fc->fp);
1018
1019 /* Try harder this time to convert some */
1020 if (fc->ibl)
1021 {
1022 size_t obl = sizeof(fc->bufo);
1023 mutt_ch_iconv(fc->cd, (const char **) &fc->ib, &fc->ibl, &fc->ob, &obl,
1024 fc->inrepls, 0, NULL);
1025 if (fc->p < fc->ob)
1026 return (unsigned char) *(fc->p)++;
1027 }
1028
1029 /* Either the file has finished or one of the buffers is too small */
1030 fc->p = 0;
1031 return EOF;
1032}
1033
/**
 * mutt_ch_fgetconvs - Convert a file's charset into a string buffer
 * @param buf    Buffer for result
 * @param buflen Length of buffer
 * @param fc     FgetConv handle
 * @retval ptr  `buf`, if at least one character was stored
 * @retval NULL `buf` is NULL, `buflen` is 0, or nothing could be read
 *
 * Characters are fetched with mutt_ch_fgetconv() until a newline has been
 * stored, EOF is returned, or only room for the terminator is left.
 * The result is always NUL-terminated; a newline, if read, is kept.
 */
char *mutt_ch_fgetconvs(char *buf, size_t buflen, struct FgetConv *fc)
{
  /* A zero-length buffer has no room even for the terminating NUL */
  if (!buf || (buflen == 0))
    return NULL;

  size_t r = 0;
  while ((r + 1) < buflen)
  {
    const int c = mutt_ch_fgetconv(fc);
    if (c == EOF)
      break;
    buf[r++] = (char) c;
    if (c == '\n')
      break;
  }
  buf[r] = '\0';

  if (r > 0)
    return buf;

  return NULL;
}
1066
1077void mutt_ch_set_charset(const char *charset)
1078{
1079 char buf[256] = { 0 };
1080
1081 mutt_ch_canonical_charset(buf, sizeof(buf), charset);
1082
1083 if (mutt_ch_is_utf8(buf))
1084 {
1085 CharsetIsUtf8 = true;
1086 ReplacementChar = 0xfffd; /* replacement character */
1087 }
1088 else
1089 {
1090 CharsetIsUtf8 = false;
1091 ReplacementChar = '?';
1092 }
1093
1094#if defined(HAVE_BIND_TEXTDOMAIN_CODESET) && defined(ENABLE_NLS)
1095 bind_textdomain_codeset(PACKAGE, buf);
1096#endif
1097}
1098
/**
 * mutt_ch_choose - Figure the best charset to encode a string
 * @param[in]  fromcode Original charset of the string
 * @param[in]  charsets List of potential charsets to use
 * @param[in]  u        String to encode
 * @param[in]  ulen     Length of the string to encode
 * @param[out] d        If non-NULL, receives the converted string
 * @param[out] dlen     If non-NULL, receives the length of the chosen result
 * @retval ptr  Canonical name of the chosen character set (newly allocated)
 * @retval NULL `fromcode` or `charsets` is NULL, or no charset in the list
 *              could encode the string
 *
 * Every charset in the list is tried against a private copy of `u`:
 * with `d` set the copy is actually converted, without it the copy is only
 * checked for convertibility.  Charsets for which that fails are skipped.
 *
 * `*d` and `*dlen` are only written when a charset was chosen.
 * NOTE(review): the returned name and `*d` look like they are owned by the
 * caller - confirm against the callers.
 */
1110char *mutt_ch_choose(const char *fromcode, const struct Slist *charsets,
1111 const char *u, size_t ulen, char **d, size_t *dlen)
1112{
1113 if (!fromcode || !charsets)
1114 return NULL;
1115
/* e/elen: best converted string so far and its length; tocode: its charset */
1116 char *e = NULL, *tocode = NULL;
1117 size_t elen = 0, bestn = 0;
1118
1119 const struct ListNode *np = NULL;
1120 STAILQ_FOREACH(np, &charsets->head, entries)
1121 {
1122 char *t = mutt_str_dup(np->data);
1123 if (!t)
1124 continue;
1125
/* NOTE(review): n is the length of the charset *name*, so among the charsets
 * that work, the one with the shortest name wins - confirm this is intended */
1126 size_t n = mutt_str_len(t);
1127 char *s = mutt_strn_dup(u, ulen);
1128 const int rc = d ? mutt_ch_convert_string(&s, fromcode, t, MUTT_ICONV_NO_FLAGS) :
1129 mutt_ch_check(s, ulen, fromcode, t);
1130 if (rc)
1131 {
1132 FREE(&t);
1133 FREE(&s);
1134 continue;
1135 }
1136 size_t slen = mutt_str_len(s);
1137
/* Keep this candidate if it is the first success or beats the current best */
1138 if (!tocode || (n < bestn))
1139 {
1140 bestn = n;
1141 FREE(&tocode);
1142 tocode = t;
1143 if (d)
1144 {
/* The converted string is only kept when the caller asked for it */
1145 FREE(&e);
1146 e = s;
1147 }
1148 else
1149 {
1150 FREE(&s);
1151 }
1152 elen = slen;
1153 }
1154 else
1155 {
1156 FREE(&t);
1157 FREE(&s);
1158 }
1159 }
1160 if (tocode)
1161 {
1162 if (d)
1163 *d = e;
1164 if (dlen)
1165 *dlen = elen;
1166
/* Hand back the canonical spelling of the winning charset */
1167 char canonical_buf[1024] = { 0 };
1168 mutt_ch_canonical_charset(canonical_buf, sizeof(canonical_buf), tocode);
1169 mutt_str_replace(&tocode, canonical_buf);
1170 }
1171 return tocode;
1172}
1173
1178{
1179 for (int i = 0; i < IconvCacheUsed; ++i)
1180 {
1181 FREE(&IconvCache[i].fromcode1);
1182 FREE(&IconvCache[i].tocode1);
1183 if (iconv_t_valid(IconvCache[i].cd))
1184 {
1185 iconv_close(IconvCache[i].cd);
1186 }
1187 }
1188 IconvCacheUsed = 0;
1189}
General purpose object for storing and parsing strings.
#define mutt_debug(LEVEL,...)
Definition: logging2.h:89
Singly-linked list type.
Logging Dispatcher.
@ LL_DEBUG2
Log at debug level 2.
Definition: logging2.h:44
void * mutt_mem_calloc(size_t nmemb, size_t size)
Allocate zeroed memory on the heap.
Definition: memory.c:50
void * mutt_mem_malloc(size_t size)
Allocate memory on the heap.
Definition: memory.c:90
Memory management wrappers.
#define FREE(x)
Definition: memory.h:45
#define MIN(a, b)
Definition: memory.h:32
bool mutt_ch_check_charset(const char *cs, bool strict)
Does iconv understand a character set?
Definition: charset.c:893
size_t mutt_ch_iconv(iconv_t cd, const char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft, const char **inrepls, const char *outrepl, int *iconverrno)
Change the encoding of a string.
Definition: charset.c:696
void mutt_ch_lookup_remove(void)
Remove all the character set lookups.
Definition: charset.c:540
static int IconvCacheUsed
Number of iconv descriptors in the cache.
Definition: charset.c:100
char * mutt_ch_choose(const char *fromcode, const struct Slist *charsets, const char *u, size_t ulen, char **d, size_t *dlen)
Figure the best charset to encode a string.
Definition: charset.c:1110
int mutt_ch_convert_nonmime_string(const struct Slist *const assumed_charset, const char *charset, char **ps)
Try to convert a string using a list of character sets.
Definition: charset.c:331
static struct LookupList Lookups
Lookup table of preferred character set names.
Definition: charset.c:83
char * mutt_ch_get_langinfo_charset(void)
Get the user's choice of character set.
Definition: charset.c:485
static const struct MimeNames PreferredMimeNames[]
Lookup table of preferred charsets.
Definition: charset.c:121
bool mutt_ch_lookup_add(enum LookupType type, const char *pat, const char *replace, struct Buffer *err)
Add a new character set lookup.
Definition: charset.c:508
void mutt_ch_canonical_charset(char *buf, size_t buflen, const char *name)
Canonicalise the charset of a string.
Definition: charset.c:374
void mutt_ch_cache_cleanup(void)
Clean up the cached iconv handles and charset strings.
Definition: charset.c:1177
const char * mutt_ch_iconv_lookup(const char *chs)
Look for a replacement character set.
Definition: charset.c:780
int mutt_ch_convert_string(char **ps, const char *from, const char *to, uint8_t flags)
Convert a string between encodings.
Definition: charset.c:830
void mutt_ch_set_charset(const char *charset)
Update the records for a new character set.
Definition: charset.c:1077
bool CharsetIsUtf8
Is the user's current character set utf-8?
Definition: charset.c:66
static const char * lookup_charset(enum LookupType type, const char *cs)
Look for a preferred character set name.
Definition: charset.c:303
int mutt_ch_check(const char *s, size_t slen, const char *from, const char *to)
Check whether a string can be converted between encodings.
Definition: charset.c:795
const char * mutt_ch_charset_lookup(const char *chs)
Look for a replacement character set.
Definition: charset.c:561
static struct Lookup * lookup_new(void)
Create a new Lookup.
Definition: charset.c:269
int mutt_ch_fgetconv(struct FgetConv *fc)
Convert a file's character set.
Definition: charset.c:982
#define ICONV_CACHE_SIZE
Max size of the iconv cache.
Definition: charset.c:96
static void lookup_free(struct Lookup **ptr)
Free a Lookup.
Definition: charset.c:278
wchar_t ReplacementChar
When a Unicode character can't be displayed, use this instead.
Definition: charset.c:61
#define EILSEQ
Definition: charset.c:55
struct FgetConv * mutt_ch_fgetconv_open(FILE *fp, const char *from, const char *to, uint8_t flags)
Prepare a file for charset conversion.
Definition: charset.c:932
static struct IconvCacheEntry IconvCache[ICONV_CACHE_SIZE]
Cache of iconv conversion descriptors.
Definition: charset.c:98
char * mutt_ch_fgetconvs(char *buf, size_t buflen, struct FgetConv *fc)
Convert a file's charset into a string buffer.
Definition: charset.c:1044
bool mutt_ch_chscmp(const char *cs1, const char *cs2)
Are the names of two character sets equivalent?
Definition: charset.c:441
void mutt_ch_fgetconv_close(struct FgetConv **ptr)
Close an fgetconv handle.
Definition: charset.c:964
iconv_t mutt_ch_iconv_open(const char *tocode, const char *fromcode, uint8_t flags)
Set up iconv for conversions.
Definition: charset.c:593
const char * mutt_ch_get_default_charset(const struct Slist *const assumed_charset)
Get the default character set.
Definition: charset.c:464
Conversion between different character encodings.
#define MUTT_ICONV_HOOK_FROM
apply charset-hooks to fromcode
Definition: charset.h:73
#define ICONV_T_INVALID
Error value for iconv functions.
Definition: charset.h:101
#define mutt_ch_is_utf8(str)
Definition: charset.h:97
LookupType
Types of character set lookups.
Definition: charset.h:67
@ MUTT_LOOKUP_ICONV
Character set conversion.
Definition: charset.h:69
@ MUTT_LOOKUP_CHARSET
Alias for another character set.
Definition: charset.h:68
#define MUTT_ICONV_NO_FLAGS
No flags are set.
Definition: charset.h:72
#define ICONV_ILLEGAL_SEQ
Error value for iconv() - Illegal sequence.
Definition: charset.h:104
static bool iconv_t_valid(const iconv_t cd)
Is the conversion descriptor valid?
Definition: charset.h:113
bool mutt_regex_match(const struct Regex *regex, const char *str)
Shorthand to mutt_regex_capture()
Definition: regex.c:639
char * mutt_strn_dup(const char *begin, size_t len)
Duplicate a sub-string.
Definition: string.c:429
bool mutt_istr_equal(const char *a, const char *b)
Compare two strings, ignoring case.
Definition: string.c:721
char * mutt_str_dup(const char *str)
Copy a string, safely.
Definition: string.c:253
void mutt_str_adjust(char **ptr)
Shrink-to-fit a string.
Definition: string.c:348
size_t mutt_str_len(const char *a)
Calculate the length of a string, safely.
Definition: string.c:545
size_t mutt_str_copy(char *dest, const char *src, size_t dsize)
Copy a string into a buffer (guaranteeing NUL-termination)
Definition: string.c:630
size_t mutt_istr_startswith(const char *str, const char *prefix)
Check whether a string starts with a prefix, ignoring case.
Definition: string.c:242
bool mutt_istrn_equal(const char *a, const char *b, size_t num)
Check for equality of two strings ignoring case (to a maximum), safely.
Definition: string.c:502
char * mutt_str_replace(char **p, const char *s)
Replace one string with another.
Definition: string.c:329
char * mutt_str_cat(char *buf, size_t buflen, const char *s)
Concatenate two strings.
Definition: string.c:268
#define TAILQ_FOREACH(var, head, field)
Definition: queue.h:725
#define TAILQ_FOREACH_SAFE(var, head, field, tvar)
Definition: queue.h:735
#define STAILQ_FIRST(head)
Definition: queue.h:350
#define TAILQ_HEAD(name, type)
Definition: queue.h:623
#define TAILQ_INSERT_TAIL(head, elm, field)
Definition: queue.h:809
#define STAILQ_FOREACH(var, head, field)
Definition: queue.h:352
#define TAILQ_REMOVE(head, elm, field)
Definition: queue.h:841
#define TAILQ_HEAD_INITIALIZER(head)
Definition: queue.h:637
#define TAILQ_ENTRY(type)
Definition: queue.h:640
Manage regular expressions.
#define REG_COMP(preg, regex, cflags)
Compile a regular expression.
Definition: regex3.h:49
A separated list of strings.
String manipulation functions.
String manipulation buffer.
Definition: buffer.h:36
size_t dsize
Length of data.
Definition: buffer.h:39
char * data
Pointer to data.
Definition: buffer.h:37
A dummy converter.
Definition: charset.h:58
Cursor for converting a file's encoding.
Definition: charset.h:42
char bufi[512]
Definition: charset.h:45
iconv_t cd
iconv conversion descriptor
Definition: charset.h:44
char bufo[512]
Definition: charset.h:46
size_t ibl
Definition: charset.h:50
FILE * fp
Definition: charset.h:43
char * p
Definition: charset.h:47
const char ** inrepls
Definition: charset.h:51
char * ib
Definition: charset.h:49
char * ob
Definition: charset.h:48
Cached iconv conversion descriptor.
Definition: charset.c:89
char * tocode1
Destination character set.
Definition: charset.c:91
char * fromcode1
Source character set.
Definition: charset.c:90
iconv_t cd
iconv conversion descriptor
Definition: charset.c:92
A List node for strings.
Definition: list.h:35
char * data
String.
Definition: list.h:36
Regex to String lookup table.
Definition: charset.c:74
char * replacement
Alternative charset to use.
Definition: charset.c:77
enum LookupType type
Lookup type.
Definition: charset.c:75
struct Regex regex
Regular expression.
Definition: charset.c:76
MIME name lookup entry.
Definition: charset.c:106
const char * key
Definition: charset.c:107
const char * pref
Definition: charset.c:108
Cached regular expression.
Definition: regex3.h:85
char * pattern
printable version
Definition: regex3.h:86
bool pat_not
do not match
Definition: regex3.h:88
regex_t * regex
compiled expression
Definition: regex3.h:87
String list.
Definition: slist.h:37
struct ListHead head
List containing values.
Definition: slist.h:38
size_t count
Number of values in list.
Definition: slist.h:39