NeoMutt  2024-12-12-29-gecf7a5
Teaching an old dog new tricks
DOXYGEN
Loading...
Searching...
No Matches
charset.c
Go to the documentation of this file.
1
32#include "config.h"
33#include <errno.h>
34#include <iconv.h>
35#include <langinfo.h>
36#include <limits.h>
37#include <stdbool.h>
38#include <stdio.h>
39#include <string.h>
40#include "charset.h"
41#include "buffer.h"
42#include "list.h"
43#include "logging2.h"
44#include "memory.h"
45#include "pool.h"
46#include "queue.h"
47#include "regex3.h"
48#include "slist.h"
49#include "string2.h"
50#ifdef ENABLE_NLS
51#include <libintl.h>
52#endif
53
54#ifndef EILSEQ
55#define EILSEQ EINVAL
56#endif
57
61wchar_t ReplacementChar = '?';
62
66bool CharsetIsUtf8 = false;
67
73struct Lookup
74{
76 struct Regex regex;
78 TAILQ_ENTRY(Lookup) entries;
79};
80TAILQ_HEAD(LookupList, Lookup);
81
83static struct LookupList Lookups = TAILQ_HEAD_INITIALIZER(Lookups);
84
/**
 * struct IconvCacheEntry - Cached iconv conversion descriptor
 */
struct IconvCacheEntry
{
  char *fromcode1; ///< Source character set
  char *tocode1;   ///< Destination character set
  iconv_t cd;      ///< iconv conversion descriptor
};

/// Max size of the iconv cache
#define ICONV_CACHE_SIZE 16
/// Cache of iconv conversion descriptors
static struct IconvCacheEntry IconvCache[ICONV_CACHE_SIZE];
/// Number of iconv descriptors in the cache
static int IconvCacheUsed = 0;
101
/**
 * struct MimeNames - MIME name lookup entry
 */
struct MimeNames
{
  const char *key;  ///< Charset alias, compared case-insensitively
  const char *pref; ///< Preferred name to use instead of the alias
};

/**
 * PreferredMimeNames - Lookup table of preferred charsets
 *
 * Maps charset aliases to the name this file prefers to use for them.
 * The table is terminated by an entry whose key is NULL.
 */
static const struct MimeNames PreferredMimeNames[] = {
  // clang-format off
  { "ansi_x3.4-1968", "us-ascii" },
  { "iso-ir-6", "us-ascii" },
  { "iso_646.irv:1991", "us-ascii" },
  { "ascii", "us-ascii" },
  { "iso646-us", "us-ascii" },
  { "us", "us-ascii" },
  { "ibm367", "us-ascii" },
  { "cp367", "us-ascii" },
  { "csASCII", "us-ascii" },

  { "csISO2022KR", "iso-2022-kr" },
  { "csEUCKR", "euc-kr" },
  { "csISO2022JP", "iso-2022-jp" },
  { "csISO2022JP2", "iso-2022-jp-2" },

  { "ISO_8859-1:1987", "iso-8859-1" },
  { "iso-ir-100", "iso-8859-1" },
  { "iso_8859-1", "iso-8859-1" },
  { "latin1", "iso-8859-1" },
  { "l1", "iso-8859-1" },
  { "IBM819", "iso-8859-1" },
  { "CP819", "iso-8859-1" },
  { "csISOLatin1", "iso-8859-1" },

  { "ISO_8859-2:1987", "iso-8859-2" },
  { "iso-ir-101", "iso-8859-2" },
  { "iso_8859-2", "iso-8859-2" },
  { "latin2", "iso-8859-2" },
  { "l2", "iso-8859-2" },
  { "csISOLatin2", "iso-8859-2" },

  { "ISO_8859-3:1988", "iso-8859-3" },
  { "iso-ir-109", "iso-8859-3" },
  { "ISO_8859-3", "iso-8859-3" },
  { "latin3", "iso-8859-3" },
  { "l3", "iso-8859-3" },
  { "csISOLatin3", "iso-8859-3" },

  { "ISO_8859-4:1988", "iso-8859-4" },
  { "iso-ir-110", "iso-8859-4" },
  { "ISO_8859-4", "iso-8859-4" },
  { "latin4", "iso-8859-4" },
  { "l4", "iso-8859-4" },
  { "csISOLatin4", "iso-8859-4" },

  { "ISO_8859-6:1987", "iso-8859-6" },
  { "iso-ir-127", "iso-8859-6" },
  { "iso_8859-6", "iso-8859-6" },
  { "ECMA-114", "iso-8859-6" },
  { "ASMO-708", "iso-8859-6" },
  { "arabic", "iso-8859-6" },
  { "csISOLatinArabic", "iso-8859-6" },

  { "ISO_8859-7:1987", "iso-8859-7" },
  { "iso-ir-126", "iso-8859-7" },
  { "ISO_8859-7", "iso-8859-7" },
  { "ELOT_928", "iso-8859-7" },
  { "ECMA-118", "iso-8859-7" },
  { "greek", "iso-8859-7" },
  { "greek8", "iso-8859-7" },
  { "csISOLatinGreek", "iso-8859-7" },

  { "ISO_8859-8:1988", "iso-8859-8" },
  { "iso-ir-138", "iso-8859-8" },
  { "ISO_8859-8", "iso-8859-8" },
  { "hebrew", "iso-8859-8" },
  { "csISOLatinHebrew", "iso-8859-8" },

  { "ISO_8859-5:1988", "iso-8859-5" },
  { "iso-ir-144", "iso-8859-5" },
  { "ISO_8859-5", "iso-8859-5" },
  { "cyrillic", "iso-8859-5" },
  { "csISOLatinCyrillic", "iso-8859-5" },

  { "ISO_8859-9:1989", "iso-8859-9" },
  { "iso-ir-148", "iso-8859-9" },
  { "ISO_8859-9", "iso-8859-9" },
  { "latin5", "iso-8859-9" }, /* this is not a bug */
  { "l5", "iso-8859-9" },
  { "csISOLatin5", "iso-8859-9" },

  { "ISO_8859-10:1992", "iso-8859-10" },
  { "iso-ir-157", "iso-8859-10" },
  { "latin6", "iso-8859-10" }, /* this is not a bug */
  { "l6", "iso-8859-10" },
  { "csISOLatin6", "iso-8859-10" },

  { "csKOI8r", "koi8-r" },

  { "MS_Kanji", "Shift_JIS" }, /* Note the underscore! */
  { "csShiftJis", "Shift_JIS" },

  { "Extended_UNIX_Code_Packed_Format_for_Japanese",
    "euc-jp" },
  { "csEUCPkdFmtJapanese", "euc-jp" },

  { "csGB2312", "gb2312" },
  { "csbig5", "big5" },

  /* End of official brain damage.
   * What follows has been taken from glibc's localedata files. */

  { "iso_8859-13", "iso-8859-13" },
  { "iso-ir-179", "iso-8859-13" },
  { "latin7", "iso-8859-13" }, /* this is not a bug */
  { "l7", "iso-8859-13" },

  { "iso_8859-14", "iso-8859-14" },
  { "latin8", "iso-8859-14" }, /* this is not a bug */
  { "l8", "iso-8859-14" },

  { "iso_8859-15", "iso-8859-15" },
  { "latin9", "iso-8859-15" }, /* this is not a bug */

  /* Suggested by Ionel Mugurel Ciobica <tgakic@sg10.chem.tue.nl> */
  { "latin0", "iso-8859-15" }, /* this is not a bug */

  { "iso_8859-16", "iso-8859-16" },
  { "latin10", "iso-8859-16" }, /* this is not a bug */

  { "646", "us-ascii" },

  /* http://www.sun.com/software/white-papers/wp-unicode/ */

  { "eucJP", "euc-jp" },
  { "PCK", "Shift_JIS" },
  { "ko_KR-euc", "euc-kr" },
  { "zh_TW-big5", "big5" },

  /* seems to be common on some systems */

  { "sjis", "Shift_JIS" },
  { "euc-jp-ms", "eucJP-ms" },

  /* If you happen to encounter system-specific brain-damage with respect to
   * character set naming, please add it above this comment, and submit a patch
   * to <neomutt-devel@neomutt.org> */

  { NULL, NULL },
  // clang-format on
};
264
269static struct Lookup *lookup_new(void)
270{
271 return MUTT_MEM_CALLOC(1, struct Lookup);
272}
273
278static void lookup_free(struct Lookup **ptr)
279{
280 if (!ptr || !*ptr)
281 return;
282
283 struct Lookup *l = *ptr;
284 FREE(&l->replacement);
285 FREE(&l->regex.pattern);
286 if (l->regex.regex)
287 regfree(l->regex.regex);
288 FREE(&l->regex.regex);
289 FREE(&l->regex);
290
291 FREE(ptr);
292}
293
303static const char *lookup_charset(enum LookupType type, const char *cs)
304{
305 if (!cs)
306 return NULL;
307
308 struct Lookup *l = NULL;
309
310 TAILQ_FOREACH(l, &Lookups, entries)
311 {
312 if (l->type != type)
313 continue;
314 if (mutt_regex_match(&l->regex, cs))
315 return l->replacement;
316 }
317 return NULL;
318}
319
331int mutt_ch_convert_nonmime_string(const struct Slist *const assumed_charset,
332 const char *charset, char **ps)
333{
334 if (!ps)
335 return -1;
336
337 char *u = *ps;
338 const size_t ulen = mutt_str_len(u);
339 if (ulen == 0)
340 return 0;
341
342 const struct ListNode *np = NULL;
343 STAILQ_FOREACH(np, &assumed_charset->head, entries)
344 {
345 char const *c = np->data;
346 size_t n = mutt_str_len(c);
347 char *fromcode = MUTT_MEM_MALLOC(n + 1, char);
348 mutt_str_copy(fromcode, c, n + 1);
349 char *s = mutt_strn_dup(u, ulen);
350 int m = mutt_ch_convert_string(&s, fromcode, charset, MUTT_ICONV_NO_FLAGS);
351 FREE(&fromcode);
352 if (m == 0)
353 {
354 FREE(ps);
355 *ps = s;
356 return 0;
357 }
358 FREE(&s);
359 }
361 charset, MUTT_ICONV_HOOK_FROM);
362 return -1;
363}
364
374void mutt_ch_canonical_charset(char *buf, size_t buflen, const char *name)
375{
376 if (!buf || !name)
377 return;
378
379 char in[1024] = { 0 };
380 char scratch[1024 + 10] = { 0 };
381 struct Buffer *canon = buf_pool_get();
382
383 mutt_str_copy(in, name, sizeof(in));
384 char *ext = strchr(in, '/');
385 if (ext)
386 *ext++ = '\0';
387
388 if (mutt_istr_equal(in, "utf-8") || mutt_istr_equal(in, "utf8"))
389 {
390 buf_strcpy(canon, "utf-8");
391 goto out;
392 }
393
394 /* catch some common iso-8859-something misspellings */
395 size_t plen;
396 if ((plen = mutt_istr_startswith(in, "8859")) && (in[plen] != '-'))
397 snprintf(scratch, sizeof(scratch), "iso-8859-%s", in + plen);
398 else if ((plen = mutt_istr_startswith(in, "8859-")))
399 snprintf(scratch, sizeof(scratch), "iso-8859-%s", in + plen);
400 else if ((plen = mutt_istr_startswith(in, "iso8859")) && (in[plen] != '-'))
401 snprintf(scratch, sizeof(scratch), "iso_8859-%s", in + plen);
402 else if ((plen = mutt_istr_startswith(in, "iso8859-")))
403 snprintf(scratch, sizeof(scratch), "iso_8859-%s", in + plen);
404 else
405 mutt_str_copy(scratch, in, sizeof(scratch));
406
407 for (size_t i = 0; PreferredMimeNames[i].key; i++)
408 {
409 if (mutt_istr_equal(scratch, PreferredMimeNames[i].key))
410 {
411 buf_strcpy(canon, PreferredMimeNames[i].pref);
412 goto out;
413 }
414 }
415
416 buf_strcpy(canon, scratch);
417 buf_lower(canon); // for cosmetics' sake
418
419out:
420 if (ext && (*ext != '\0'))
421 {
422 buf_addch(canon, '/');
423 buf_addstr(canon, ext);
424 }
425
426 mutt_str_copy(buf, buf_string(canon), buflen);
427 buf_pool_release(&canon);
428}
429
/**
 * mutt_ch_chscmp - Are the names of two character sets equivalent?
 * @param cs1 First character set (canonicalised before comparing)
 * @param cs2 Second character set (compared as given)
 * @retval true  The names agree, ignoring case, over the shorter one's length
 * @retval false The names differ, or either is NULL
 *
 * @note Only the first min(len1, len2) characters are compared, so one name
 *       being a prefix of the other counts as a match.
 */
bool mutt_ch_chscmp(const char *cs1, const char *cs2)
{
  if (!cs1 || !cs2)
    return false;

  char canon[256] = { 0 };
  mutt_ch_canonical_charset(canon, sizeof(canon), cs1);

  const int canon_len = mutt_str_len(canon);
  const int cs2_len = mutt_str_len(cs2);

  const char *longer = cs2;
  const char *shorter = canon;
  int cmp_len = canon_len;
  if (canon_len > cs2_len)
  {
    longer = canon;
    shorter = cs2;
    cmp_len = cs2_len;
  }

  return mutt_istrn_equal(longer, shorter, cmp_len);
}
457
465const char *mutt_ch_get_default_charset(const struct Slist *const assumed_charset)
466{
467 static char fcharset[128];
468 const char *c = NULL;
469
470 if (assumed_charset && (assumed_charset->count > 0))
471 c = STAILQ_FIRST(&assumed_charset->head)->data;
472 else
473 c = "us-ascii";
474
475 mutt_str_copy(fcharset, c, sizeof(fcharset));
476 return fcharset;
477}
478
/**
 * mutt_ch_get_langinfo_charset - Get the user's choice of character set
 * @retval ptr Charset string
 *
 * The codeset reported by nl_langinfo(CODESET) is canonicalised.  If that
 * yields an empty name, "iso-8859-1" is used instead.
 *
 * @note The string is duplicated with mutt_str_dup(); the caller owns it.
 */
char *mutt_ch_get_langinfo_charset(void)
{
  char buf[1024] = { 0 };

  mutt_ch_canonical_charset(buf, sizeof(buf), nl_langinfo(CODESET));

  if (buf[0] != '\0')
    return mutt_str_dup(buf);

  return mutt_str_dup("iso-8859-1");
}
497
509bool mutt_ch_lookup_add(enum LookupType type, const char *pat,
510 const char *replace, struct Buffer *err)
511{
512 if (!pat || !replace)
513 return false;
514
515 regex_t *rx = MUTT_MEM_CALLOC(1, regex_t);
516 int rc = REG_COMP(rx, pat, REG_ICASE);
517 if (rc != 0)
518 {
519 regerror(rc, rx, err->data, err->dsize);
520 FREE(&rx);
521 return false;
522 }
523
524 struct Lookup *l = lookup_new();
525 l->type = type;
526 l->replacement = mutt_str_dup(replace);
527 l->regex.pattern = mutt_str_dup(pat);
528 l->regex.regex = rx;
529 l->regex.pat_not = false;
530
531 TAILQ_INSERT_TAIL(&Lookups, l, entries);
532
533 return true;
534}
535
542{
543 struct Lookup *l = NULL;
544 struct Lookup *tmp = NULL;
545
546 TAILQ_FOREACH_SAFE(l, &Lookups, entries, tmp)
547 {
548 TAILQ_REMOVE(&Lookups, l, entries);
549 lookup_free(&l);
550 }
551}
552
562const char *mutt_ch_charset_lookup(const char *chs)
563{
565}
566
594iconv_t mutt_ch_iconv_open(const char *tocode, const char *fromcode, uint8_t flags)
595{
596 char tocode1[128] = { 0 };
597 char fromcode1[128] = { 0 };
598 const char *tocode2 = NULL, *fromcode2 = NULL;
599 const char *tmp = NULL;
600
601 /* transform to MIME preferred charset names */
602 mutt_ch_canonical_charset(tocode1, sizeof(tocode1), tocode);
603 mutt_ch_canonical_charset(fromcode1, sizeof(fromcode1), fromcode);
604
605 /* maybe apply charset-hooks and recanonicalise fromcode,
606 * but only when caller asked us to sanitize a potentially wrong
607 * charset name incoming from the wild exterior. */
608 if (flags & MUTT_ICONV_HOOK_FROM)
609 {
610 tmp = mutt_ch_charset_lookup(fromcode1);
611 if (tmp)
612 mutt_ch_canonical_charset(fromcode1, sizeof(fromcode1), tmp);
613 }
614
615 /* check if we have this pair cached already */
616 for (int i = 0; i < IconvCacheUsed; ++i)
617 {
618 if (strcmp(tocode1, IconvCache[i].tocode1) == 0 &&
619 strcmp(fromcode1, IconvCache[i].fromcode1) == 0)
620 {
621 iconv_t cd = IconvCache[i].cd;
622
623 /* make room for this one at the top */
624 struct IconvCacheEntry top = IconvCache[i];
625 for (int j = i; j-- > 0;)
626 {
627 IconvCache[j + 1] = IconvCache[j];
628 }
629 IconvCache[0] = top;
630
631 if (iconv_t_valid(cd))
632 {
633 /* reset state */
634 iconv(cd, NULL, NULL, NULL, NULL);
635 }
636 return cd;
637 }
638 }
639
640 /* not found in cache */
641 /* always apply iconv-hooks to suit system's iconv tastes */
642 tocode2 = mutt_ch_iconv_lookup(tocode1);
643 tocode2 = tocode2 ? tocode2 : tocode1;
644 fromcode2 = mutt_ch_iconv_lookup(fromcode1);
645 fromcode2 = fromcode2 ? fromcode2 : fromcode1;
646
647 /* call system iconv with names it appreciates */
648 iconv_t cd = iconv_open(tocode2, fromcode2);
649
651 {
652 mutt_debug(LL_DEBUG2, "iconv: dropping %s -> %s from the cache\n",
655 /* get rid of the oldest entry */
659 {
660 iconv_close(IconvCache[IconvCacheUsed - 1].cd);
661 }
663 }
664
665 /* make room for this one at the top */
666 for (int j = IconvCacheUsed; j-- > 0;)
667 {
668 IconvCache[j + 1] = IconvCache[j];
669 }
670
672
673 mutt_debug(LL_DEBUG2, "iconv: adding %s -> %s to the cache\n", fromcode1, tocode1);
674 IconvCache[0].fromcode1 = strdup(fromcode1);
675 IconvCache[0].tocode1 = strdup(tocode1);
676 IconvCache[0].cd = cd;
677
678 return cd;
679}
680
697size_t mutt_ch_iconv(iconv_t cd, const char **inbuf, size_t *inbytesleft,
698 char **outbuf, size_t *outbytesleft, const char **inrepls,
699 const char *outrepl, int *iconverrno)
700{
701 size_t rc = 0;
702 const char *ib = *inbuf;
703 size_t ibl = *inbytesleft;
704 char *ob = *outbuf;
705 size_t obl = *outbytesleft;
706
707 while (true)
708 {
709 errno = 0;
710 const size_t ret1 = iconv(cd, (ICONV_CONST char **) &ib, &ibl, &ob, &obl);
711 if (ret1 != ICONV_ILLEGAL_SEQ)
712 rc += ret1;
713 if (iconverrno)
714 *iconverrno = errno;
715
716 if (ibl && obl && (errno == EILSEQ))
717 {
718 if (inrepls)
719 {
720 /* Try replacing the input */
721 const char **t = NULL;
722 for (t = inrepls; *t; t++)
723 {
724 const char *ib1 = *t;
725 size_t ibl1 = strlen(*t);
726 char *ob1 = ob;
727 size_t obl1 = obl;
728 iconv(cd, (ICONV_CONST char **) &ib1, &ibl1, &ob1, &obl1);
729 if (ibl1 == 0)
730 {
731 ib++;
732 ibl--;
733 ob = ob1;
734 obl = obl1;
735 rc++;
736 break;
737 }
738 }
739 if (*t)
740 continue;
741 }
742 /* Replace the output */
743 if (!outrepl)
744 outrepl = "?";
745 iconv(cd, NULL, NULL, &ob, &obl);
746 if (obl)
747 {
748 int n = strlen(outrepl);
749 if (n > obl)
750 {
751 outrepl = "?";
752 n = 1;
753 }
754 memcpy(ob, outrepl, n);
755 ib++;
756 ibl--;
757 ob += n;
758 obl -= n;
759 rc++;
760 iconv(cd, NULL, NULL, NULL, NULL); /* for good measure */
761 continue;
762 }
763 }
764 *inbuf = ib;
765 *inbytesleft = ibl;
766 *outbuf = ob;
767 *outbytesleft = obl;
768 return rc;
769 }
770}
771
781const char *mutt_ch_iconv_lookup(const char *chs)
782{
784}
785
796int mutt_ch_check(const char *s, size_t slen, const char *from, const char *to)
797{
798 if (!s || !from || !to)
799 return -1;
800
801 int rc = 0;
802 iconv_t cd = mutt_ch_iconv_open(to, from, MUTT_ICONV_NO_FLAGS);
803 if (!iconv_t_valid(cd))
804 return -1;
805
806 size_t outlen = MB_LEN_MAX * slen;
807 char *out = MUTT_MEM_MALLOC(outlen + 1, char);
808 char *saved_out = out;
809
810 const size_t convlen = iconv(cd, (ICONV_CONST char **) &s, &slen, &out, &outlen);
811 if (convlen == ICONV_ILLEGAL_SEQ)
812 rc = errno;
813
814 FREE(&saved_out);
815 return rc;
816}
817
831int mutt_ch_convert_string(char **ps, const char *from, const char *to, uint8_t flags)
832{
833 if (!ps)
834 return -1;
835
836 char *s = *ps;
837
838 if (!s || (*s == '\0'))
839 return 0;
840
841 if (!to || !from)
842 return -1;
843
844 const char *repls[] = { "\357\277\275", "?", 0 };
845 int rc = 0;
846
847 iconv_t cd = mutt_ch_iconv_open(to, from, flags);
848 if (!iconv_t_valid(cd))
849 return -1;
850
851 const char **inrepls = NULL;
852 const char *outrepl = NULL;
853
854 if (mutt_ch_is_utf8(to))
855 outrepl = "\357\277\275";
856 else if (mutt_ch_is_utf8(from))
857 inrepls = repls;
858 else
859 outrepl = "?";
860
861 const char *ib = s;
862 size_t ibl = strlen(s);
863 if (ibl >= (SIZE_MAX / MB_LEN_MAX))
864 {
865 return -1;
866 }
867 size_t obl = MB_LEN_MAX * ibl;
868 char *buf = MUTT_MEM_MALLOC(obl + 1, char);
869 char *ob = buf;
870
871 mutt_ch_iconv(cd, &ib, &ibl, &ob, &obl, inrepls, outrepl, &rc);
872 iconv(cd, 0, 0, &ob, &obl);
873
874 *ob = '\0';
875
876 FREE(ps);
877 *ps = buf;
878
879 mutt_str_adjust(ps);
880 return rc;
881}
882
894bool mutt_ch_check_charset(const char *cs, bool strict)
895{
896 if (!cs)
897 return false;
898
899 if (mutt_ch_is_utf8(cs))
900 return true;
901
902 if (!strict)
903 {
904 for (int i = 0; PreferredMimeNames[i].key; i++)
905 {
906 if (mutt_istr_equal(PreferredMimeNames[i].key, cs) ||
908 {
909 return true;
910 }
911 }
912 }
913
914 iconv_t cd = mutt_ch_iconv_open(cs, cs, MUTT_ICONV_NO_FLAGS);
915 if (iconv_t_valid(cd))
916 {
917 return true;
918 }
919
920 return false;
921}
922
933struct FgetConv *mutt_ch_fgetconv_open(FILE *fp, const char *from, const char *to, uint8_t flags)
934{
935 iconv_t cd = ICONV_T_INVALID;
936
937 if (from && to)
938 cd = mutt_ch_iconv_open(to, from, flags);
939
940 struct FgetConv *fc = MUTT_MEM_CALLOC(1, struct FgetConv);
941 fc->fp = fp;
942 fc->cd = cd;
943
944 if (iconv_t_valid(cd))
945 {
946 static const char *repls[] = { "\357\277\275", "?", 0 };
947
948 fc->p = fc->bufo;
949 fc->ob = fc->bufo;
950 fc->ib = fc->bufi;
951 fc->ibl = 0;
952 fc->inrepls = mutt_ch_is_utf8(to) ? repls : repls + 1;
953 }
954
955 return fc;
956}
957
/**
 * mutt_ch_fgetconv_close - Close an fgetconv handle
 * @param[out] ptr fgetconv handle
 *
 * Only the handle itself is freed.  The FILE and the iconv descriptor it
 * refers to are left open.
 */
void mutt_ch_fgetconv_close(struct FgetConv **ptr)
{
  if (!ptr || !*ptr)
    return;

  FREE(ptr);
}
969
981{
982 if (!fc)
983 return EOF;
984 if (!iconv_t_valid(fc->cd))
985 return fgetc(fc->fp);
986 if (!fc->p)
987 return EOF;
988 if (fc->p < fc->ob)
989 return (unsigned char) *(fc->p)++;
990
991 /* Try to convert some more */
992 fc->p = fc->bufo;
993 fc->ob = fc->bufo;
994 if (fc->ibl)
995 {
996 size_t obl = sizeof(fc->bufo);
997 iconv(fc->cd, (ICONV_CONST char **) &fc->ib, &fc->ibl, &fc->ob, &obl);
998 if (fc->p < fc->ob)
999 return (unsigned char) *(fc->p)++;
1000 }
1001
1002 /* If we trusted iconv a bit more, we would at this point
1003 * ask why it had stopped converting ... */
1004
1005 /* Try to read some more */
1006 if ((fc->ibl == sizeof(fc->bufi)) ||
1007 (fc->ibl && (fc->ib + fc->ibl < fc->bufi + sizeof(fc->bufi))))
1008 {
1009 fc->p = 0;
1010 return EOF;
1011 }
1012 if (fc->ibl)
1013 memcpy(fc->bufi, fc->ib, fc->ibl);
1014 fc->ib = fc->bufi;
1015 fc->ibl += fread(fc->ib + fc->ibl, 1, sizeof(fc->bufi) - fc->ibl, fc->fp);
1016
1017 /* Try harder this time to convert some */
1018 if (fc->ibl)
1019 {
1020 size_t obl = sizeof(fc->bufo);
1021 mutt_ch_iconv(fc->cd, (const char **) &fc->ib, &fc->ibl, &fc->ob, &obl,
1022 fc->inrepls, 0, NULL);
1023 if (fc->p < fc->ob)
1024 return (unsigned char) *(fc->p)++;
1025 }
1026
1027 /* Either the file has finished or one of the buffers is too small */
1028 fc->p = 0;
1029 return EOF;
1030}
1031
/**
 * mutt_ch_fgetconvs - Convert a file's charset into a string buffer
 * @param buf    Buffer for the result
 * @param buflen Length of the buffer
 * @param fc     FgetConv handle
 * @retval ptr  buf, holding at least one converted character
 * @retval NULL Nothing could be read, or buf is NULL, or buflen is 0
 *
 * A fgets()-like function: reading stops after a newline (which is kept),
 * at EOF, or when only room for the terminating NUL is left.
 */
char *mutt_ch_fgetconvs(char *buf, size_t buflen, struct FgetConv *fc)
{
  /* A zero-length buffer has no room even for the terminating NUL */
  if (!buf || (buflen == 0))
    return NULL;

  size_t r;
  for (r = 0; (r + 1) < buflen;)
  {
    const int c = mutt_ch_fgetconv(fc);
    if (c == EOF)
      break;
    buf[r++] = (char) c;
    if (c == '\n')
      break;
  }
  buf[r] = '\0';

  if (r > 0)
    return buf;

  return NULL;
}
1064
1075void mutt_ch_set_charset(const char *charset)
1076{
1077 char buf[256] = { 0 };
1078
1079 mutt_ch_canonical_charset(buf, sizeof(buf), charset);
1080
1081 if (mutt_ch_is_utf8(buf))
1082 {
1083 CharsetIsUtf8 = true;
1084 ReplacementChar = 0xfffd; /* replacement character */
1085 }
1086 else
1087 {
1088 CharsetIsUtf8 = false;
1089 ReplacementChar = '?';
1090 }
1091
1092#if defined(HAVE_BIND_TEXTDOMAIN_CODESET) && defined(ENABLE_NLS)
1093 bind_textdomain_codeset(PACKAGE, buf);
1094#endif
1095}
1096
1108char *mutt_ch_choose(const char *fromcode, const struct Slist *charsets,
1109 const char *u, size_t ulen, char **d, size_t *dlen)
1110{
1111 if (!fromcode || !charsets)
1112 return NULL;
1113
1114 char *e = NULL, *tocode = NULL;
1115 size_t elen = 0, bestn = 0;
1116
1117 const struct ListNode *np = NULL;
1118 STAILQ_FOREACH(np, &charsets->head, entries)
1119 {
1120 char *t = mutt_str_dup(np->data);
1121 if (!t)
1122 continue;
1123
1124 size_t n = mutt_str_len(t);
1125 char *s = mutt_strn_dup(u, ulen);
1126 const int rc = d ? mutt_ch_convert_string(&s, fromcode, t, MUTT_ICONV_NO_FLAGS) :
1127 mutt_ch_check(s, ulen, fromcode, t);
1128 if (rc)
1129 {
1130 FREE(&t);
1131 FREE(&s);
1132 continue;
1133 }
1134 size_t slen = mutt_str_len(s);
1135
1136 if (!tocode || (n < bestn))
1137 {
1138 bestn = n;
1139 FREE(&tocode);
1140 tocode = t;
1141 if (d)
1142 {
1143 FREE(&e);
1144 e = s;
1145 }
1146 else
1147 {
1148 FREE(&s);
1149 }
1150 elen = slen;
1151 }
1152 else
1153 {
1154 FREE(&t);
1155 FREE(&s);
1156 }
1157 }
1158 if (tocode)
1159 {
1160 if (d)
1161 *d = e;
1162 if (dlen)
1163 *dlen = elen;
1164
1165 char canonical_buf[1024] = { 0 };
1166 mutt_ch_canonical_charset(canonical_buf, sizeof(canonical_buf), tocode);
1167 mutt_str_replace(&tocode, canonical_buf);
1168 }
1169 return tocode;
1170}
1171
1176{
1177 for (int i = 0; i < IconvCacheUsed; ++i)
1178 {
1179 FREE(&IconvCache[i].fromcode1);
1180 FREE(&IconvCache[i].tocode1);
1181 if (iconv_t_valid(IconvCache[i].cd))
1182 {
1183 iconv_close(IconvCache[i].cd);
1184 }
1185 }
1186 IconvCacheUsed = 0;
1187}
size_t buf_addch(struct Buffer *buf, char c)
Add a single character to a Buffer.
Definition: buffer.c:241
size_t buf_addstr(struct Buffer *buf, const char *s)
Add a string to a Buffer.
Definition: buffer.c:226
size_t buf_strcpy(struct Buffer *buf, const char *s)
Copy a string into a Buffer.
Definition: buffer.c:395
void buf_lower(struct Buffer *buf)
Sets a buffer to lowercase.
Definition: buffer.c:736
General purpose object for storing and parsing strings.
static const char * buf_string(const struct Buffer *buf)
Convert a buffer to a const char * "string".
Definition: buffer.h:96
#define mutt_debug(LEVEL,...)
Definition: logging2.h:89
Singly-linked list type.
Logging Dispatcher.
@ LL_DEBUG2
Log at debug level 2.
Definition: logging2.h:44
Memory management wrappers.
#define FREE(x)
Definition: memory.h:55
#define MIN(a, b)
Definition: memory.h:32
#define MUTT_MEM_CALLOC(n, type)
Definition: memory.h:40
#define MUTT_MEM_MALLOC(n, type)
Definition: memory.h:41
bool mutt_ch_check_charset(const char *cs, bool strict)
Does iconv understand a character set?
Definition: charset.c:894
size_t mutt_ch_iconv(iconv_t cd, const char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft, const char **inrepls, const char *outrepl, int *iconverrno)
Change the encoding of a string.
Definition: charset.c:697
void mutt_ch_lookup_remove(void)
Remove all the character set lookups.
Definition: charset.c:541
static int IconvCacheUsed
Number of iconv descriptors in the cache.
Definition: charset.c:100
char * mutt_ch_choose(const char *fromcode, const struct Slist *charsets, const char *u, size_t ulen, char **d, size_t *dlen)
Figure the best charset to encode a string.
Definition: charset.c:1108
int mutt_ch_convert_nonmime_string(const struct Slist *const assumed_charset, const char *charset, char **ps)
Try to convert a string using a list of character sets.
Definition: charset.c:331
static struct LookupList Lookups
Lookup table of preferred character set names.
Definition: charset.c:83
char * mutt_ch_get_langinfo_charset(void)
Get the user's choice of character set.
Definition: charset.c:486
static const struct MimeNames PreferredMimeNames[]
Lookup table of preferred charsets.
Definition: charset.c:121
bool mutt_ch_lookup_add(enum LookupType type, const char *pat, const char *replace, struct Buffer *err)
Add a new character set lookup.
Definition: charset.c:509
void mutt_ch_canonical_charset(char *buf, size_t buflen, const char *name)
Canonicalise the charset of a string.
Definition: charset.c:374
void mutt_ch_cache_cleanup(void)
Clean up the cached iconv handles and charset strings.
Definition: charset.c:1175
const char * mutt_ch_iconv_lookup(const char *chs)
Look for a replacement character set.
Definition: charset.c:781
int mutt_ch_convert_string(char **ps, const char *from, const char *to, uint8_t flags)
Convert a string between encodings.
Definition: charset.c:831
void mutt_ch_set_charset(const char *charset)
Update the records for a new character set.
Definition: charset.c:1075
bool CharsetIsUtf8
Is the user's current character set utf-8?
Definition: charset.c:66
static const char * lookup_charset(enum LookupType type, const char *cs)
Look for a preferred character set name.
Definition: charset.c:303
int mutt_ch_check(const char *s, size_t slen, const char *from, const char *to)
Check whether a string can be converted between encodings.
Definition: charset.c:796
const char * mutt_ch_charset_lookup(const char *chs)
Look for a replacement character set.
Definition: charset.c:562
static struct Lookup * lookup_new(void)
Create a new Lookup.
Definition: charset.c:269
int mutt_ch_fgetconv(struct FgetConv *fc)
Convert a file's character set.
Definition: charset.c:980
#define ICONV_CACHE_SIZE
Max size of the iconv cache.
Definition: charset.c:96
static void lookup_free(struct Lookup **ptr)
Free a Lookup.
Definition: charset.c:278
wchar_t ReplacementChar
When a Unicode character can't be displayed, use this instead.
Definition: charset.c:61
#define EILSEQ
Definition: charset.c:55
struct FgetConv * mutt_ch_fgetconv_open(FILE *fp, const char *from, const char *to, uint8_t flags)
Prepare a file for charset conversion.
Definition: charset.c:933
static struct IconvCacheEntry IconvCache[ICONV_CACHE_SIZE]
Cache of iconv conversion descriptors.
Definition: charset.c:98
char * mutt_ch_fgetconvs(char *buf, size_t buflen, struct FgetConv *fc)
Convert a file's charset into a string buffer.
Definition: charset.c:1042
bool mutt_ch_chscmp(const char *cs1, const char *cs2)
Are the names of two character sets equivalent?
Definition: charset.c:442
void mutt_ch_fgetconv_close(struct FgetConv **ptr)
Close an fgetconv handle.
Definition: charset.c:962
iconv_t mutt_ch_iconv_open(const char *tocode, const char *fromcode, uint8_t flags)
Set up iconv for conversions.
Definition: charset.c:594
const char * mutt_ch_get_default_charset(const struct Slist *const assumed_charset)
Get the default character set.
Definition: charset.c:465
Conversion between different character encodings.
#define MUTT_ICONV_HOOK_FROM
apply charset-hooks to fromcode
Definition: charset.h:65
#define ICONV_T_INVALID
Error value for iconv functions.
Definition: charset.h:93
#define mutt_ch_is_utf8(str)
Definition: charset.h:89
LookupType
Types of character set lookups.
Definition: charset.h:59
@ MUTT_LOOKUP_ICONV
Character set conversion.
Definition: charset.h:61
@ MUTT_LOOKUP_CHARSET
Alias for another character set.
Definition: charset.h:60
#define MUTT_ICONV_NO_FLAGS
No flags are set.
Definition: charset.h:64
#define ICONV_ILLEGAL_SEQ
Error value for iconv() - Illegal sequence.
Definition: charset.h:96
static bool iconv_t_valid(const iconv_t cd)
Is the conversion descriptor valid?
Definition: charset.h:105
bool mutt_regex_match(const struct Regex *regex, const char *str)
Shorthand to mutt_regex_capture()
Definition: regex.c:614
char * mutt_strn_dup(const char *begin, size_t len)
Duplicate a sub-string.
Definition: string.c:380
bool mutt_istr_equal(const char *a, const char *b)
Compare two strings, ignoring case.
Definition: string.c:672
char * mutt_str_dup(const char *str)
Copy a string, safely.
Definition: string.c:253
void mutt_str_adjust(char **ptr)
Shrink-to-fit a string.
Definition: string.c:299
size_t mutt_str_len(const char *a)
Calculate the length of a string, safely.
Definition: string.c:496
size_t mutt_str_copy(char *dest, const char *src, size_t dsize)
Copy a string into a buffer (guaranteeing NUL-termination)
Definition: string.c:581
size_t mutt_istr_startswith(const char *str, const char *prefix)
Check whether a string starts with a prefix, ignoring case.
Definition: string.c:242
bool mutt_istrn_equal(const char *a, const char *b, size_t num)
Check for equality of two strings ignoring case (to a maximum), safely.
Definition: string.c:453
char * mutt_str_replace(char **p, const char *s)
Replace one string with another.
Definition: string.c:280
struct Buffer * buf_pool_get(void)
Get a Buffer from the pool.
Definition: pool.c:82
void buf_pool_release(struct Buffer **ptr)
Return a Buffer to the pool.
Definition: pool.c:96
A global pool of Buffers.
#define TAILQ_FOREACH(var, head, field)
Definition: queue.h:743
#define TAILQ_FOREACH_SAFE(var, head, field, tvar)
Definition: queue.h:753
#define STAILQ_FIRST(head)
Definition: queue.h:350
#define TAILQ_HEAD(name, type)
Definition: queue.h:641
#define TAILQ_INSERT_TAIL(head, elm, field)
Definition: queue.h:827
#define STAILQ_FOREACH(var, head, field)
Definition: queue.h:352
#define TAILQ_REMOVE(head, elm, field)
Definition: queue.h:862
#define TAILQ_HEAD_INITIALIZER(head)
Definition: queue.h:655
#define TAILQ_ENTRY(type)
Definition: queue.h:658
Manage regular expressions.
#define REG_COMP(preg, regex, cflags)
Compile a regular expression.
Definition: regex3.h:50
A separated list of strings.
String manipulation functions.
String manipulation buffer.
Definition: buffer.h:36
size_t dsize
Length of data.
Definition: buffer.h:39
char * data
Pointer to data.
Definition: buffer.h:37
Cursor for converting a file's encoding.
Definition: charset.h:43
char bufi[512]
Definition: charset.h:46
iconv_t cd
iconv conversion descriptor
Definition: charset.h:45
char bufo[512]
Definition: charset.h:47
size_t ibl
Definition: charset.h:51
FILE * fp
Definition: charset.h:44
char * p
Definition: charset.h:48
const char ** inrepls
Definition: charset.h:52
char * ib
Definition: charset.h:50
char * ob
Definition: charset.h:49
Cached iconv conversion descriptor.
Definition: charset.c:89
char * tocode1
Destination character set.
Definition: charset.c:91
char * fromcode1
Source character set.
Definition: charset.c:90
iconv_t cd
iconv conversion descriptor
Definition: charset.c:92
A List node for strings.
Definition: list.h:37
char * data
String.
Definition: list.h:38
Regex to String lookup table.
Definition: charset.c:74
char * replacement
Alternative charset to use.
Definition: charset.c:77
enum LookupType type
Lookup type.
Definition: charset.c:75
struct Regex regex
Regular expression.
Definition: charset.c:76
MIME name lookup entry.
Definition: charset.c:106
const char * key
Definition: charset.c:107
const char * pref
Definition: charset.c:108
Cached regular expression.
Definition: regex3.h:86
char * pattern
printable version
Definition: regex3.h:87
bool pat_not
do not match
Definition: regex3.h:89
regex_t * regex
compiled expression
Definition: regex3.h:88
String list.
Definition: slist.h:37
struct ListHead head
List containing values.
Definition: slist.h:38
size_t count
Number of values in list.
Definition: slist.h:39