NeoMutt  2020-06-26-89-g172cd3
Teaching an old dog new tricks
DOXYGEN
charset.c
Go to the documentation of this file.
1 
29 #include "config.h"
30 #include <ctype.h>
31 #include <errno.h>
32 #include <iconv.h>
33 #include <langinfo.h>
34 #include <limits.h>
35 #include <stdbool.h>
36 #include <stdio.h>
37 #include <string.h>
38 #include "charset.h"
39 #include "buffer.h"
40 #include "memory.h"
41 #include "queue.h"
42 #include "regex3.h"
43 #include "string2.h"
44 #ifdef ENABLE_NLS
45 #include <libintl.h>
46 #endif
47 
48 #ifndef EILSEQ
49 #define EILSEQ EINVAL
50 #endif
51 
char *C_AssumedCharset; ///< Config: If a message is missing a character set, assume this character set
char *C_Charset;        ///< Config: Default character set for displaying text on screen

/**
 * ReplacementChar - When a Unicode character can't be displayed, use this instead
 *
 * Updated by mutt_ch_set_charset().
 */
wchar_t ReplacementChar = '?';

/**
 * CharsetIsUtf8 - Is the user's current character set utf-8?
 *
 * Updated by mutt_ch_set_charset().
 */
bool CharsetIsUtf8 = false;
64 
70 struct Lookup
71 {
73  struct Regex regex;
74  char *replacement;
75  TAILQ_ENTRY(Lookup) entries;
76 };
77 TAILQ_HEAD(LookupList, Lookup);
78 
79 static struct LookupList Lookups = TAILQ_HEAD_INITIALIZER(Lookups);
80 
/**
 * struct MimeNames - MIME name lookup entry
 */
struct MimeNames
{
  const char *key;  ///< Charset name as it may be encountered
  const char *pref; ///< Name to use in its place
};

// clang-format off
/**
 * PreferredMimeNames - Lookup table of preferred charsets
 *
 * mutt_ch_canonical_charset() compares a name against each `key`, ignoring
 * case, and substitutes the matching `pref`.
 * The list is terminated by a { NULL, NULL } entry.
 */
const struct MimeNames PreferredMimeNames[] =
{
  { "ansi_x3.4-1968",      "us-ascii"      },
  { "iso-ir-6",            "us-ascii"      },
  { "iso_646.irv:1991",    "us-ascii"      },
  { "ascii",               "us-ascii"      },
  { "iso646-us",           "us-ascii"      },
  { "us",                  "us-ascii"      },
  { "ibm367",              "us-ascii"      },
  { "cp367",               "us-ascii"      },
  { "csASCII",             "us-ascii"      },

  { "csISO2022KR",         "iso-2022-kr"   },
  { "csEUCKR",             "euc-kr"        },
  { "csISO2022JP",         "iso-2022-jp"   },
  { "csISO2022JP2",        "iso-2022-jp-2" },

  { "ISO_8859-1:1987",     "iso-8859-1"    },
  { "iso-ir-100",          "iso-8859-1"    },
  { "iso_8859-1",          "iso-8859-1"    },
  { "latin1",              "iso-8859-1"    },
  { "l1",                  "iso-8859-1"    },
  { "IBM819",              "iso-8859-1"    },
  { "CP819",               "iso-8859-1"    },
  { "csISOLatin1",         "iso-8859-1"    },

  { "ISO_8859-2:1987",     "iso-8859-2"    },
  { "iso-ir-101",          "iso-8859-2"    },
  { "iso_8859-2",          "iso-8859-2"    },
  { "latin2",              "iso-8859-2"    },
  { "l2",                  "iso-8859-2"    },
  { "csISOLatin2",         "iso-8859-2"    },

  { "ISO_8859-3:1988",     "iso-8859-3"    },
  { "iso-ir-109",          "iso-8859-3"    },
  { "ISO_8859-3",          "iso-8859-3"    },
  { "latin3",              "iso-8859-3"    },
  { "l3",                  "iso-8859-3"    },
  { "csISOLatin3",         "iso-8859-3"    },

  { "ISO_8859-4:1988",     "iso-8859-4"    },
  { "iso-ir-110",          "iso-8859-4"    },
  { "ISO_8859-4",          "iso-8859-4"    },
  { "latin4",              "iso-8859-4"    },
  { "l4",                  "iso-8859-4"    },
  { "csISOLatin4",         "iso-8859-4"    },

  { "ISO_8859-6:1987",     "iso-8859-6"    },
  { "iso-ir-127",          "iso-8859-6"    },
  { "iso_8859-6",          "iso-8859-6"    },
  { "ECMA-114",            "iso-8859-6"    },
  { "ASMO-708",            "iso-8859-6"    },
  { "arabic",              "iso-8859-6"    },
  { "csISOLatinArabic",    "iso-8859-6"    },

  { "ISO_8859-7:1987",     "iso-8859-7"    },
  { "iso-ir-126",          "iso-8859-7"    },
  { "ISO_8859-7",          "iso-8859-7"    },
  { "ELOT_928",            "iso-8859-7"    },
  { "ECMA-118",            "iso-8859-7"    },
  { "greek",               "iso-8859-7"    },
  { "greek8",              "iso-8859-7"    },
  { "csISOLatinGreek",     "iso-8859-7"    },

  { "ISO_8859-8:1988",     "iso-8859-8"    },
  { "iso-ir-138",          "iso-8859-8"    },
  { "ISO_8859-8",          "iso-8859-8"    },
  { "hebrew",              "iso-8859-8"    },
  { "csISOLatinHebrew",    "iso-8859-8"    },

  { "ISO_8859-5:1988",     "iso-8859-5"    },
  { "iso-ir-144",          "iso-8859-5"    },
  { "ISO_8859-5",          "iso-8859-5"    },
  { "cyrillic",            "iso-8859-5"    },
  { "csISOLatinCyrillic",  "iso-8859-5"    },

  { "ISO_8859-9:1989",     "iso-8859-9"    },
  { "iso-ir-148",          "iso-8859-9"    },
  { "ISO_8859-9",          "iso-8859-9"    },
  { "latin5",              "iso-8859-9"    }, /* deliberate: latin5 is 8859-9 */
  { "l5",                  "iso-8859-9"    },
  { "csISOLatin5",         "iso-8859-9"    },

  { "ISO_8859-10:1992",    "iso-8859-10"   },
  { "iso-ir-157",          "iso-8859-10"   },
  { "latin6",              "iso-8859-10"   }, /* deliberate: latin6 is 8859-10 */
  { "l6",                  "iso-8859-10"   },
  { "csISOLatin6",         "iso-8859-10"   },

  { "csKOI8r",             "koi8-r"        },

  { "MS_Kanji",            "Shift_JIS"     }, /* mind the underscore in the preferred name */
  { "csShiftJis",          "Shift_JIS"     },

  { "Extended_UNIX_Code_Packed_Format_for_Japanese",
                           "euc-jp"        },
  { "csEUCPkdFmtJapanese", "euc-jp"        },

  { "csGB2312",            "gb2312"        },
  { "csbig5",              "big5"          },

  /* The officially registered aliases end here.
   * The entries below come from glibc's localedata files. */

  { "iso_8859-13",         "iso-8859-13"   },
  { "iso-ir-179",          "iso-8859-13"   },
  { "latin7",              "iso-8859-13"   }, /* deliberate: latin7 is 8859-13 */
  { "l7",                  "iso-8859-13"   },

  { "iso_8859-14",         "iso-8859-14"   },
  { "latin8",              "iso-8859-14"   }, /* deliberate: latin8 is 8859-14 */
  { "l8",                  "iso-8859-14"   },

  { "iso_8859-15",         "iso-8859-15"   },
  { "latin9",              "iso-8859-15"   }, /* deliberate: latin9 is 8859-15 */

  /* Suggested by Ionel Mugurel Ciobica <tgakic@sg10.chem.tue.nl> */
  { "latin0",              "iso-8859-15"   }, /* deliberate: latin0 is 8859-15 */

  { "iso_8859-16",         "iso-8859-16"   },
  { "latin10",             "iso-8859-16"   }, /* deliberate: latin10 is 8859-16 */

  { "646",                 "us-ascii"      },

  /* http://www.sun.com/software/white-papers/wp-unicode/ */

  { "eucJP",               "euc-jp"        },
  { "PCK",                 "Shift_JIS"     },
  { "ko_KR-euc",           "euc-kr"        },
  { "zh_TW-big5",          "big5"          },

  /* Names that appear to be common on some systems */

  { "sjis",                "Shift_JIS"     },
  { "euc-jp-ms",           "eucJP-ms"      },

  /* Found another system-specific oddity in character set naming?
   * Add it above this comment and send a patch to
   * <neomutt-devel@neomutt.org> */

  { NULL,                  NULL            },
};
// clang-format on
244 
249 static struct Lookup *lookup_new(void)
250 {
251  return mutt_mem_calloc(1, sizeof(struct Lookup));
252 }
253 
258 static void lookup_free(struct Lookup **ptr)
259 {
260  if (!ptr || !*ptr)
261  return;
262 
263  struct Lookup *l = *ptr;
264  FREE(&l->replacement);
265  FREE(&l->regex.pattern);
266  if (l->regex.regex)
267  regfree(l->regex.regex);
268  FREE(&l->regex.regex);
269  FREE(&l->regex);
270 
271  FREE(ptr);
272 }
273 
283 static const char *lookup_charset(enum LookupType type, const char *cs)
284 {
285  if (!cs)
286  return NULL;
287 
288  struct Lookup *l = NULL;
289 
290  TAILQ_FOREACH(l, &Lookups, entries)
291  {
292  if (l->type != type)
293  continue;
294  if (mutt_regex_match(&l->regex, cs))
295  return l->replacement;
296  }
297  return NULL;
298 }
299 
310 {
311  if (!ps)
312  return -1;
313 
314  char *u = *ps;
315  const size_t ulen = mutt_str_len(u);
316  if (ulen == 0)
317  return 0;
318 
319  const char *c1 = NULL;
320 
321  for (const char *c = C_AssumedCharset; c; c = c1 ? c1 + 1 : 0)
322  {
323  c1 = strchr(c, ':');
324  size_t n = c1 ? c1 - c : mutt_str_len(c);
325  if (n == 0)
326  return 0;
327  char *fromcode = mutt_mem_malloc(n + 1);
328  mutt_str_copy(fromcode, c, n + 1);
329  char *s = mutt_strn_dup(u, ulen);
330  int m = mutt_ch_convert_string(&s, fromcode, C_Charset, 0);
331  FREE(&fromcode);
332  FREE(&s);
333  if (m == 0)
334  {
335  return 0;
336  }
337  }
340  return -1;
341 }
342 
352 void mutt_ch_canonical_charset(char *buf, size_t buflen, const char *name)
353 {
354  if (!buf || !name)
355  return;
356 
357  char in[1024], scratch[1024];
358 
359  mutt_str_copy(in, name, sizeof(in));
360  char *ext = strchr(in, '/');
361  if (ext)
362  *ext++ = '\0';
363 
364  if (mutt_istr_equal(in, "utf-8") || mutt_istr_equal(in, "utf8"))
365  {
366  mutt_str_copy(buf, "utf-8", buflen);
367  goto out;
368  }
369 
370  /* catch some common iso-8859-something misspellings */
371  size_t plen;
372  if ((plen = mutt_istr_startswith(in, "8859")) && (in[plen] != '-'))
373  snprintf(scratch, sizeof(scratch), "iso-8859-%s", in + plen);
374  else if ((plen = mutt_istr_startswith(in, "8859-")))
375  snprintf(scratch, sizeof(scratch), "iso-8859-%s", in + plen);
376  else if ((plen = mutt_istr_startswith(in, "iso8859")) && (in[plen] != '-'))
377  snprintf(scratch, sizeof(scratch), "iso_8859-%s", in + plen);
378  else if ((plen = mutt_istr_startswith(in, "iso8859-")))
379  snprintf(scratch, sizeof(scratch), "iso_8859-%s", in + plen);
380  else
381  mutt_str_copy(scratch, in, sizeof(scratch));
382 
383  for (size_t i = 0; PreferredMimeNames[i].key; i++)
384  {
385  if (mutt_istr_equal(scratch, PreferredMimeNames[i].key))
386  {
387  mutt_str_copy(buf, PreferredMimeNames[i].pref, buflen);
388  goto out;
389  }
390  }
391 
392  mutt_str_copy(buf, scratch, buflen);
393 
394  /* for cosmetics' sake, transform to lowercase. */
395  for (char *p = buf; *p; p++)
396  *p = tolower(*p);
397 
398 out:
399  if (ext && *ext)
400  {
401  mutt_str_cat(buf, buflen, "/");
402  mutt_str_cat(buf, buflen, ext);
403  }
404 }
405 
/**
 * mutt_ch_chscmp - Are the names of two character sets equivalent?
 * @param cs1 First character set (canonicalised before comparing)
 * @param cs2 Second character set (used as given)
 * @retval true  The names agree over the length of the shorter one
 * @retval false They differ, or either name is NULL
 *
 * The comparison ignores case and only covers as many characters as the
 * shorter of the two names has.
 */
bool mutt_ch_chscmp(const char *cs1, const char *cs2)
{
  if (!cs1 || !cs2)
    return false;

  char canon[256];
  mutt_ch_canonical_charset(canon, sizeof(canon), cs1);

  const int canon_len = mutt_str_len(canon);
  const int other_len = mutt_str_len(cs2);

  /* default: the canonical name is the longer of the two */
  const char *longer = canon;
  const char *shorter = cs2;
  int cmp_len = other_len;

  if (canon_len <= other_len)
  {
    longer = cs2;
    shorter = canon;
    cmp_len = canon_len;
  }

  return mutt_istrn_equal(longer, shorter, cmp_len);
}
433 
441 {
442  static char fcharset[128];
443  const char *c = C_AssumedCharset;
444  const char *c1 = NULL;
445 
446  if (c)
447  {
448  c1 = strchr(c, ':');
449  mutt_str_copy(fcharset, c, c1 ? (c1 - c + 1) : sizeof(fcharset));
450  return fcharset;
451  }
452  return strcpy(fcharset, "us-ascii");
453 }
454 
/**
 * mutt_ch_get_langinfo_charset - Get the user's choice of character set
 * @retval ptr Newly allocated charset name
 *
 * The name is the canonical form of nl_langinfo(CODESET); if that turns out
 * empty, "iso-8859-1" is used instead.
 *
 * @note The result is a fresh copy made by mutt_str_dup()
 */
char *mutt_ch_get_langinfo_charset(void)
{
  char buf[1024] = { 0 };

  mutt_ch_canonical_charset(buf, sizeof(buf), nl_langinfo(CODESET));

  if (buf[0] != '\0')
    return mutt_str_dup(buf);

  return mutt_str_dup("iso-8859-1");
}
473 
485 bool mutt_ch_lookup_add(enum LookupType type, const char *pat,
486  const char *replace, struct Buffer *err)
487 {
488  if (!pat || !replace)
489  return false;
490 
491  regex_t *rx = mutt_mem_malloc(sizeof(regex_t));
492  int rc = REG_COMP(rx, pat, REG_ICASE);
493  if (rc != 0)
494  {
495  regerror(rc, rx, err->data, err->dsize);
496  FREE(&rx);
497  return false;
498  }
499 
500  struct Lookup *l = lookup_new();
501  l->type = type;
502  l->replacement = mutt_str_dup(replace);
503  l->regex.pattern = mutt_str_dup(pat);
504  l->regex.regex = rx;
505  l->regex.pat_not = false;
506 
507  TAILQ_INSERT_TAIL(&Lookups, l, entries);
508 
509  return true;
510 }
511 
518 {
519  struct Lookup *l = NULL;
520  struct Lookup *tmp = NULL;
521 
522  TAILQ_FOREACH_SAFE(l, &Lookups, entries, tmp)
523  {
524  TAILQ_REMOVE(&Lookups, l, entries);
525  lookup_free(&l);
526  }
527 }
528 
538 const char *mutt_ch_charset_lookup(const char *chs)
539 {
540  return lookup_charset(MUTT_LOOKUP_CHARSET, chs);
541 }
542 
565 iconv_t mutt_ch_iconv_open(const char *tocode, const char *fromcode, int flags)
566 {
567  char tocode1[128];
568  char fromcode1[128];
569  const char *tocode2 = NULL, *fromcode2 = NULL;
570  const char *tmp = NULL;
571 
572  iconv_t cd;
573 
574  /* transform to MIME preferred charset names */
575  mutt_ch_canonical_charset(tocode1, sizeof(tocode1), tocode);
576  mutt_ch_canonical_charset(fromcode1, sizeof(fromcode1), fromcode);
577 
578  /* maybe apply charset-hooks and recanonicalise fromcode,
579  * but only when caller asked us to sanitize a potentially wrong
580  * charset name incoming from the wild exterior. */
581  if (flags & MUTT_ICONV_HOOK_FROM)
582  {
583  tmp = mutt_ch_charset_lookup(fromcode1);
584  if (tmp)
585  mutt_ch_canonical_charset(fromcode1, sizeof(fromcode1), tmp);
586  }
587 
588  /* always apply iconv-hooks to suit system's iconv tastes */
589  tocode2 = mutt_ch_iconv_lookup(tocode1);
590  tocode2 = tocode2 ? tocode2 : tocode1;
591  fromcode2 = mutt_ch_iconv_lookup(fromcode1);
592  fromcode2 = fromcode2 ? fromcode2 : fromcode1;
593 
594  /* call system iconv with names it appreciates */
595  cd = iconv_open(tocode2, fromcode2);
596  if (cd != (iconv_t) -1)
597  return cd;
598 
599  return (iconv_t) -1;
600 }
601 
618 size_t mutt_ch_iconv(iconv_t cd, const char **inbuf, size_t *inbytesleft,
619  char **outbuf, size_t *outbytesleft, const char **inrepls,
620  const char *outrepl, int *iconverrno)
621 {
622  size_t rc = 0;
623  const char *ib = *inbuf;
624  size_t ibl = *inbytesleft;
625  char *ob = *outbuf;
626  size_t obl = *outbytesleft;
627 
628  while (true)
629  {
630  errno = 0;
631  const size_t ret1 = iconv(cd, (ICONV_CONST char **) &ib, &ibl, &ob, &obl);
632  if (ret1 != (size_t) -1)
633  rc += ret1;
634  if (iconverrno)
635  *iconverrno = errno;
636 
637  if (ibl && obl && (errno == EILSEQ))
638  {
639  if (inrepls)
640  {
641  /* Try replacing the input */
642  const char **t = NULL;
643  for (t = inrepls; *t; t++)
644  {
645  const char *ib1 = *t;
646  size_t ibl1 = strlen(*t);
647  char *ob1 = ob;
648  size_t obl1 = obl;
649  iconv(cd, (ICONV_CONST char **) &ib1, &ibl1, &ob1, &obl1);
650  if (ibl1 == 0)
651  {
652  ib++;
653  ibl--;
654  ob = ob1;
655  obl = obl1;
656  rc++;
657  break;
658  }
659  }
660  if (*t)
661  continue;
662  }
663  /* Replace the output */
664  if (!outrepl)
665  outrepl = "?";
666  iconv(cd, NULL, NULL, &ob, &obl);
667  if (obl)
668  {
669  int n = strlen(outrepl);
670  if (n > obl)
671  {
672  outrepl = "?";
673  n = 1;
674  }
675  memcpy(ob, outrepl, n);
676  ib++;
677  ibl--;
678  ob += n;
679  obl -= n;
680  rc++;
681  iconv(cd, NULL, NULL, NULL, NULL); /* for good measure */
682  continue;
683  }
684  }
685  *inbuf = ib;
686  *inbytesleft = ibl;
687  *outbuf = ob;
688  *outbytesleft = obl;
689  return rc;
690  }
691 }
692 
702 const char *mutt_ch_iconv_lookup(const char *chs)
703 {
704  return lookup_charset(MUTT_LOOKUP_ICONV, chs);
705 }
706 
717 int mutt_ch_check(const char *s, size_t slen, const char *from, const char *to)
718 {
719  if (!s || !from || !to)
720  return -1;
721 
722  int rc = 0;
723  iconv_t cd = mutt_ch_iconv_open(to, from, 0);
724  if (cd == (iconv_t) -1)
725  return -1;
726 
727  size_t outlen = MB_LEN_MAX * slen;
728  char *out = mutt_mem_malloc(outlen + 1);
729  char *saved_out = out;
730 
731  const size_t convlen =
732  iconv(cd, (ICONV_CONST char **) &s, &slen, &out, (size_t *) &outlen);
733  if (convlen == -1)
734  rc = errno;
735 
736  FREE(&saved_out);
737  iconv_close(cd);
738  return rc;
739 }
740 
/**
 * mutt_ch_convert_string - Convert a string between encodings
 * @param[in,out] ps    String to convert
 * @param[in]     from  Current character set
 * @param[in]     to    Target character set
 * @param[in]     flags Flags passed on to mutt_ch_iconv_open()
 * @retval  0 Success, or *ps was NULL or empty
 * @retval -1 ps, from or to was NULL, or the converter could not be opened
 * @retval num errno reported for the final iconv() call
 *
 * Once the converter is open, *ps is always freed and replaced by the newly
 * allocated result, whatever the return value.
 */
int mutt_ch_convert_string(char **ps, const char *from, const char *to, int flags)
{
  if (!ps)
    return -1;

  char *src = *ps;
  if (!src || (src[0] == '\0'))
    return 0;

  if (!from || !to)
    return -1;

  iconv_t cd = mutt_ch_iconv_open(to, from, flags);
  if (cd == (iconv_t) -1)
    return -1;

  /* "\357\277\275" is U+FFFD (replacement character) encoded as UTF-8 */
  const char *repls[] = { "\357\277\275", "?", 0 };
  const char **inrepls = NULL;
  const char *outrepl = NULL;

  if (mutt_ch_is_utf8(to))
    outrepl = "\357\277\275";
  else if (mutt_ch_is_utf8(from))
    inrepls = repls;
  else
    outrepl = "?";

  /* the terminating NUL is converted along with the text */
  const char *ib = src;
  size_t ibl = strlen(src) + 1;
  size_t obl = MB_LEN_MAX * ibl;
  char *converted = mutt_mem_malloc(obl + 1);
  char *ob = converted;

  int rc = 0;
  mutt_ch_iconv(cd, &ib, &ibl, &ob, &obl, inrepls, outrepl, &rc);
  iconv_close(cd);

  *ob = '\0';

  FREE(ps);
  *ps = converted;

  mutt_str_adjust(ps);
  return rc;
}
806 
818 bool mutt_ch_check_charset(const char *cs, bool strict)
819 {
820  if (!cs)
821  return false;
822 
823  if (mutt_ch_is_utf8(cs))
824  return true;
825 
826  if (!strict)
827  {
828  for (int i = 0; PreferredMimeNames[i].key; i++)
829  {
830  if (mutt_istr_equal(PreferredMimeNames[i].key, cs) ||
831  mutt_istr_equal(PreferredMimeNames[i].pref, cs))
832  {
833  return true;
834  }
835  }
836  }
837 
838  iconv_t cd = mutt_ch_iconv_open(cs, cs, 0);
839  if (cd != (iconv_t)(-1))
840  {
841  iconv_close(cd);
842  return true;
843  }
844 
845  return false;
846 }
847 
858 struct FgetConv *mutt_ch_fgetconv_open(FILE *fp, const char *from, const char *to, int flags)
859 {
860  struct FgetConv *fc = NULL;
861  iconv_t cd = (iconv_t) -1;
862 
863  if (from && to)
864  cd = mutt_ch_iconv_open(to, from, flags);
865 
866  if (cd != (iconv_t) -1)
867  {
868  static const char *repls[] = { "\357\277\275", "?", 0 };
869 
870  fc = mutt_mem_malloc(sizeof(struct FgetConv));
871  fc->p = fc->bufo;
872  fc->ob = fc->bufo;
873  fc->ib = fc->bufi;
874  fc->ibl = 0;
875  fc->inrepls = mutt_ch_is_utf8(to) ? repls : repls + 1;
876  }
877  else
878  fc = mutt_mem_malloc(sizeof(struct FgetConvNot));
879  fc->fp = fp;
880  fc->cd = cd;
881  return fc;
882 }
883 
889 {
890  if (!fc || !*fc)
891  return;
892 
893  if ((*fc)->cd != (iconv_t) -1)
894  iconv_close((*fc)->cd);
895  FREE(fc);
896 }
897 
908 int mutt_ch_fgetconv(struct FgetConv *fc)
909 {
910  if (!fc)
911  return EOF;
912  if (fc->cd == (iconv_t) -1)
913  return fgetc(fc->fp);
914  if (!fc->p)
915  return EOF;
916  if (fc->p < fc->ob)
917  return (unsigned char) *(fc->p)++;
918 
919  /* Try to convert some more */
920  fc->p = fc->bufo;
921  fc->ob = fc->bufo;
922  if (fc->ibl)
923  {
924  size_t obl = sizeof(fc->bufo);
925  iconv(fc->cd, (ICONV_CONST char **) &fc->ib, &fc->ibl, &fc->ob, &obl);
926  if (fc->p < fc->ob)
927  return (unsigned char) *(fc->p)++;
928  }
929 
930  /* If we trusted iconv a bit more, we would at this point
931  * ask why it had stopped converting ... */
932 
933  /* Try to read some more */
934  if ((fc->ibl == sizeof(fc->bufi)) ||
935  (fc->ibl && (fc->ib + fc->ibl < fc->bufi + sizeof(fc->bufi))))
936  {
937  fc->p = 0;
938  return EOF;
939  }
940  if (fc->ibl)
941  memcpy(fc->bufi, fc->ib, fc->ibl);
942  fc->ib = fc->bufi;
943  fc->ibl += fread(fc->ib + fc->ibl, 1, sizeof(fc->bufi) - fc->ibl, fc->fp);
944 
945  /* Try harder this time to convert some */
946  if (fc->ibl)
947  {
948  size_t obl = sizeof(fc->bufo);
949  mutt_ch_iconv(fc->cd, (const char **) &fc->ib, &fc->ibl, &fc->ob, &obl,
950  fc->inrepls, 0, NULL);
951  if (fc->p < fc->ob)
952  return (unsigned char) *(fc->p)++;
953  }
954 
955  /* Either the file has finished or one of the buffers is too small */
956  fc->p = 0;
957  return EOF;
958 }
959 
/**
 * mutt_ch_fgetconvs - Convert a file's charset into a string buffer
 * @param buf    Buffer for result
 * @param buflen Length of buffer
 * @param fc     FgetConv handle
 * @retval ptr  buf, holding at least one character
 * @retval NULL buf is NULL, buflen is 0, or nothing could be read
 *
 * Characters are read until a newline has been stored, the input ends, or
 * only room for the terminating NUL is left.  The result is always
 * NUL-terminated.
 */
char *mutt_ch_fgetconvs(char *buf, size_t buflen, struct FgetConv *fc)
{
  /* a zero-length buffer has no room even for the terminating NUL */
  if (!buf || (buflen == 0))
    return NULL;

  size_t r;
  for (r = 0; (r + 1) < buflen;)
  {
    const int c = mutt_ch_fgetconv(fc);
    if (c == EOF)
      break;
    buf[r++] = (char) c;
    if (c == '\n')
      break;
  }
  buf[r] = '\0';

  if (r > 0)
    return buf;

  return NULL;
}
992 
1003 void mutt_ch_set_charset(const char *charset)
1004 {
1005  char buf[256];
1006 
1007  mutt_ch_canonical_charset(buf, sizeof(buf), charset);
1008 
1009  if (mutt_ch_is_utf8(buf))
1010  {
1011  CharsetIsUtf8 = true;
1012  ReplacementChar = 0xfffd; /* replacement character */
1013  }
1014  else
1015  {
1016  CharsetIsUtf8 = false;
1017  ReplacementChar = '?';
1018  }
1019 
1020 #if defined(HAVE_BIND_TEXTDOMAIN_CODESET) && defined(ENABLE_NLS)
1021  bind_textdomain_codeset(PACKAGE, buf);
1022 #endif
1023 }
1024 
/**
 * mutt_ch_choose - Figure the best charset to encode a string
 * @param[in]  fromcode Original charset of the string
 * @param[in]  charsets Colon-separated list of potential charsets to use
 * @param[in]  u        String to encode
 * @param[in]  ulen     Length of the string to encode
 * @param[out] d        If not NULL, receives the converted string
 * @param[out] dlen     If not NULL, receives the length of the converted string
 * @retval ptr  Canonical name of the chosen charset (caller must free)
 * @retval NULL fromcode is NULL, or no charset in the list worked
 *
 * Empty list entries are skipped.  When d is NULL the string is only checked
 * for convertibility, not converted.
 */
char *mutt_ch_choose(const char *fromcode, const char *charsets, const char *u,
                     size_t ulen, char **d, size_t *dlen)
{
  if (!fromcode)
    return NULL;

  char *best_text = NULL;
  char *best_code = NULL;
  size_t best_text_len = 0;
  size_t best_name_len = 0;
  const char *sep = NULL;

  for (const char *p = charsets; p; p = sep ? sep + 1 : 0)
  {
    sep = strchr(p, ':');

    const size_t name_len = sep ? sep - p : strlen(p);
    if (name_len == 0)
      continue;

    /* NUL-terminated copy of this list entry */
    char *name = mutt_mem_malloc(name_len + 1);
    memcpy(name, p, name_len);
    name[name_len] = '\0';

    char *text = mutt_strn_dup(u, ulen);
    int rc;
    if (d)
      rc = mutt_ch_convert_string(&text, fromcode, name, 0);
    else
      rc = mutt_ch_check(text, ulen, fromcode, name);

    if (rc != 0)
    {
      FREE(&name);
      FREE(&text);
      continue;
    }
    const size_t text_len = mutt_str_len(text);

    /* NOTE(review): candidates are ranked by the length of the charset
     * *name* -- confirm that this is the intended measure */
    const bool better = !best_code || (name_len < best_name_len);
    if (!better)
    {
      FREE(&name);
      FREE(&text);
      continue;
    }

    best_name_len = name_len;
    FREE(&best_code);
    best_code = name;
    if (d)
    {
      FREE(&best_text);
      best_text = text;
    }
    else
    {
      FREE(&text);
    }
    best_text_len = text_len;
  }

  if (!best_code)
    return NULL;

  if (d)
    *d = best_text;
  if (dlen)
    *dlen = best_text_len;

  char canonical_buf[1024];
  mutt_ch_canonical_charset(canonical_buf, sizeof(canonical_buf), best_code);
  mutt_str_replace(&best_code, canonical_buf);

  return best_code;
}
static struct Lookup * lookup_new(void)
Create a new Lookup.
Definition: charset.c:249
char * C_AssumedCharset
Config: If a message is missing a character set, assume this character set.
Definition: charset.c:52
#define mutt_ch_is_utf8(str)
Definition: charset.h:95
Character set conversion.
Definition: charset.h:69
void * mutt_mem_calloc(size_t nmemb, size_t size)
Allocate zeroed memory on the heap.
Definition: memory.c:50
int mutt_ch_convert_string(char **ps, const char *from, const char *to, int flags)
Convert a string between encodings.
Definition: charset.c:754
#define MIN(a, b)
Definition: memory.h:31
Memory management wrappers.
char * mutt_ch_choose(const char *fromcode, const char *charsets, const char *u, size_t ulen, char **d, size_t *dlen)
Figure the best charset to encode a string.
Definition: charset.c:1036
#define TAILQ_FOREACH(var, head, field)
Definition: queue.h:718
int mutt_ch_fgetconv(struct FgetConv *fc)
Convert a file's character set.
Definition: charset.c:908
const char * key
Definition: charset.c:86
char bufi[512]
Definition: charset.h:45
static const char * lookup_charset(enum LookupType type, const char *cs)
Look for a preferred character set name.
Definition: charset.c:283
static size_t plen
Length of cached packet.
Definition: pgppacket.c:39
regex_t * regex
compiled expression
Definition: regex3.h:91
Conversion between different character encodings.
String manipulation buffer.
Definition: buffer.h:33
bool pat_not
do not match
Definition: regex3.h:92
#define TAILQ_FOREACH_SAFE(var, head, field, tvar)
Definition: queue.h:728
char * mutt_str_dup(const char *str)
Copy a string, safely.
Definition: string.c:375
char * replacement
Alternative charset to use.
Definition: charset.c:74
bool mutt_ch_check_charset(const char *cs, bool strict)
Does iconv understand a character set?
Definition: charset.c:818
struct Regex regex
Regular expression.
Definition: charset.c:73
void mutt_str_adjust(char **p)
Shrink-to-fit a string.
Definition: string.c:495
void mutt_ch_set_charset(const char *charset)
Update the records for a new character set.
Definition: charset.c:1003
bool mutt_ch_lookup_add(enum LookupType type, const char *pat, const char *replace, struct Buffer *err)
Add a new character set lookup.
Definition: charset.c:485
String manipulation functions.
wchar_t ReplacementChar
When a Unicode character can't be displayed, use this instead.
Definition: charset.c:58
bool mutt_istrn_equal(const char *a, const char *b, size_t l)
Check for equality of two strings ignoring case (to a maximum), safely.
Definition: string.c:626
Regex to String lookup table.
Definition: charset.c:70
size_t dsize
Length of data.
Definition: buffer.h:37
bool mutt_ch_chscmp(const char *cs1, const char *cs2)
Are the names of two character sets equivalent?
Definition: charset.c:418
char * mutt_strn_dup(const char *begin, size_t len)
Duplicate a sub-string.
Definition: string.c:553
size_t ibl
Definition: charset.h:50
const char * mutt_ch_iconv_lookup(const char *chs)
Look for a replacement character set.
Definition: charset.c:702
LookupType
Types of character set lookups.
Definition: charset.h:66
#define REG_COMP(preg, regex, cflags)
Compile a regular expression.
Definition: regex3.h:53
void mutt_ch_canonical_charset(char *buf, size_t buflen, const char *name)
Canonicalise the charset of a string.
Definition: charset.c:352
static void lookup_free(struct Lookup **ptr)
Free a Lookup.
Definition: charset.c:258
static char * chs
Definition: gnupgparse.c:73
bool mutt_istr_equal(const char *a, const char *b)
Compare two strings, ignoring case.
Definition: string.c:888
#define TAILQ_REMOVE(head, elm, field)
Definition: queue.h:834
size_t mutt_ch_iconv(iconv_t cd, const char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft, const char **inrepls, const char *outrepl, int *iconverrno)
Change the encoding of a string.
Definition: charset.c:618
int mutt_ch_check(const char *s, size_t slen, const char *from, const char *to)
Check whether a string can be converted between encodings.
Definition: charset.c:717
A dummy converter.
Definition: charset.h:57
FILE * fp
Definition: charset.h:43
iconv_t cd
Definition: charset.h:44
void * mutt_mem_malloc(size_t size)
Allocate memory on the heap.
Definition: memory.c:90
const char * mutt_ch_charset_lookup(const char *chs)
Look for a replacement character set.
Definition: charset.c:538
char * data
Pointer to data.
Definition: buffer.h:35
static struct LookupList Lookups
Definition: charset.c:79
iconv_t mutt_ch_iconv_open(const char *tocode, const char *fromcode, int flags)
Set up iconv for conversions.
Definition: charset.c:565
#define TAILQ_INSERT_TAIL(head, elm, field)
Definition: queue.h:802
char * ib
Definition: charset.h:49
char * mutt_ch_get_langinfo_charset(void)
Get the user's choice of character set.
Definition: charset.c:462
char * p
Definition: charset.h:47
size_t mutt_istr_startswith(const char *str, const char *prefix)
Check whether a string starts with a prefix, ignoring case.
Definition: string.c:177
void mutt_ch_lookup_remove(void)
Remove all the character set lookups.
Definition: charset.c:517
MIME name lookup entry.
Definition: charset.c:84
char bufo[512]
Definition: charset.h:46
enum LookupType type
Lookup type.
Definition: charset.c:72
char * mutt_str_cat(char *buf, size_t buflen, const char *s)
Concatenate two strings.
Definition: string.c:390
#define TAILQ_ENTRY(type)
Definition: queue.h:633
void mutt_ch_fgetconv_close(struct FgetConv **fc)
Close an fgetconv handle.
Definition: charset.c:888
size_t mutt_str_len(const char *a)
Calculate the length of a string, safely.
Definition: string.c:636
General purpose object for storing and parsing strings.
const char * pref
Definition: charset.c:87
bool CharsetIsUtf8
Is the user's current character set utf-8?
Definition: charset.c:63
int n
Definition: acutest.h:492
size_t mutt_str_copy(char *dest, const char *src, size_t dsize)
Copy a string into a buffer (guaranteeing NUL-termination)
Definition: string.c:721
char * mutt_str_replace(char **p, const char *s)
Replace one string with another.
Definition: string.c:451
int mutt_ch_convert_nonmime_string(char **ps)
Try to convert a string using a list of character sets.
Definition: charset.c:309
Cached regular expression.
Definition: regex3.h:88
bool mutt_regex_match(const struct Regex *regex, const char *str)
Shorthand to mutt_regex_capture()
Definition: regex.c:609
#define FREE(x)
Definition: memory.h:40
char * mutt_ch_get_default_charset(void)
Get the default character set.
Definition: charset.c:440
char * mutt_ch_fgetconvs(char *buf, size_t buflen, struct FgetConv *fc)
Convert a file's charset into a string buffer.
Definition: charset.c:970
#define EILSEQ
Definition: charset.c:49
Manage regular expressions.
char * C_Charset
Config: Default character set for displaying text on screen.
Definition: charset.c:53
char * pattern
printable version
Definition: regex3.h:90
#define MUTT_ICONV_HOOK_FROM
apply charset-hooks to fromcode
Definition: charset.h:72
struct FgetConv * mutt_ch_fgetconv_open(FILE *fp, const char *from, const char *to, int flags)
Prepare a file for charset conversion.
Definition: charset.c:858
#define TAILQ_HEAD_INITIALIZER(head)
Definition: queue.h:630
TAILQ_HEAD(LookupList, Lookup)
Cursor for converting a file's encoding.
Definition: charset.h:41
Alias for another character set.
Definition: charset.h:68
char * ob
Definition: charset.h:48
const char ** inrepls
Definition: charset.h:51