NeoMutt  2019-11-11
Teaching an old dog new tricks
DOXYGEN
charset.c
Go to the documentation of this file.
1 
29 #include "config.h"
30 #include <ctype.h>
31 #include <errno.h>
32 #include <iconv.h>
33 #include <langinfo.h>
34 #include <limits.h>
35 #include <regex.h>
36 #include <stdbool.h>
37 #include <stdio.h>
38 #include <string.h>
39 #include "charset.h"
40 #include "buffer.h"
41 #include "memory.h"
42 #include "queue.h"
43 #include "regex3.h"
44 #include "string2.h"
45 #ifdef ENABLE_NLS
46 #include <libintl.h>
47 #endif
48 
49 #ifndef EILSEQ
50 #define EILSEQ EINVAL
51 #endif
52 
/**
 * C_Charset - Config: Default character set for displaying text on screen
 */
char *C_Charset;

/**
 * ReplacementChar - When a Unicode character can't be displayed, use this instead
 *
 * Starts out as '?'; mutt_ch_set_charset() switches it according to the
 * active character set.
 */
wchar_t ReplacementChar = '?';

/**
 * CharsetIsUtf8 - Is the user's current character set utf-8?
 *
 * Maintained by mutt_ch_set_charset().
 */
bool CharsetIsUtf8 = false;
65 
71 struct Lookup
72 {
74  struct Regex regex;
75  char *replacement;
76  TAILQ_ENTRY(Lookup) entries;
77 };
78 TAILQ_HEAD(LookupList, Lookup);
79 
80 static struct LookupList Lookups = TAILQ_HEAD_INITIALIZER(Lookups);
81 
82 // clang-format off
93 const struct MimeNames PreferredMimeNames[] =
94 {
95  { "ansi_x3.4-1968", "us-ascii" },
96  { "iso-ir-6", "us-ascii" },
97  { "iso_646.irv:1991", "us-ascii" },
98  { "ascii", "us-ascii" },
99  { "iso646-us", "us-ascii" },
100  { "us", "us-ascii" },
101  { "ibm367", "us-ascii" },
102  { "cp367", "us-ascii" },
103  { "csASCII", "us-ascii" },
104 
105  { "csISO2022KR", "iso-2022-kr" },
106  { "csEUCKR", "euc-kr" },
107  { "csISO2022JP", "iso-2022-jp" },
108  { "csISO2022JP2", "iso-2022-jp-2" },
109 
110  { "ISO_8859-1:1987", "iso-8859-1" },
111  { "iso-ir-100", "iso-8859-1" },
112  { "iso_8859-1", "iso-8859-1" },
113  { "latin1", "iso-8859-1" },
114  { "l1", "iso-8859-1" },
115  { "IBM819", "iso-8859-1" },
116  { "CP819", "iso-8859-1" },
117  { "csISOLatin1", "iso-8859-1" },
118 
119  { "ISO_8859-2:1987", "iso-8859-2" },
120  { "iso-ir-101", "iso-8859-2" },
121  { "iso_8859-2", "iso-8859-2" },
122  { "latin2", "iso-8859-2" },
123  { "l2", "iso-8859-2" },
124  { "csISOLatin2", "iso-8859-2" },
125 
126  { "ISO_8859-3:1988", "iso-8859-3" },
127  { "iso-ir-109", "iso-8859-3" },
128  { "ISO_8859-3", "iso-8859-3" },
129  { "latin3", "iso-8859-3" },
130  { "l3", "iso-8859-3" },
131  { "csISOLatin3", "iso-8859-3" },
132 
133  { "ISO_8859-4:1988", "iso-8859-4" },
134  { "iso-ir-110", "iso-8859-4" },
135  { "ISO_8859-4", "iso-8859-4" },
136  { "latin4", "iso-8859-4" },
137  { "l4", "iso-8859-4" },
138  { "csISOLatin4", "iso-8859-4" },
139 
140  { "ISO_8859-6:1987", "iso-8859-6" },
141  { "iso-ir-127", "iso-8859-6" },
142  { "iso_8859-6", "iso-8859-6" },
143  { "ECMA-114", "iso-8859-6" },
144  { "ASMO-708", "iso-8859-6" },
145  { "arabic", "iso-8859-6" },
146  { "csISOLatinArabic", "iso-8859-6" },
147 
148  { "ISO_8859-7:1987", "iso-8859-7" },
149  { "iso-ir-126", "iso-8859-7" },
150  { "ISO_8859-7", "iso-8859-7" },
151  { "ELOT_928", "iso-8859-7" },
152  { "ECMA-118", "iso-8859-7" },
153  { "greek", "iso-8859-7" },
154  { "greek8", "iso-8859-7" },
155  { "csISOLatinGreek", "iso-8859-7" },
156 
157  { "ISO_8859-8:1988", "iso-8859-8" },
158  { "iso-ir-138", "iso-8859-8" },
159  { "ISO_8859-8", "iso-8859-8" },
160  { "hebrew", "iso-8859-8" },
161  { "csISOLatinHebrew", "iso-8859-8" },
162 
163  { "ISO_8859-5:1988", "iso-8859-5" },
164  { "iso-ir-144", "iso-8859-5" },
165  { "ISO_8859-5", "iso-8859-5" },
166  { "cyrillic", "iso-8859-5" },
167  { "csISOLatinCyrillic", "iso-8859-5" },
168 
169  { "ISO_8859-9:1989", "iso-8859-9" },
170  { "iso-ir-148", "iso-8859-9" },
171  { "ISO_8859-9", "iso-8859-9" },
172  { "latin5", "iso-8859-9" }, /* this is not a bug */
173  { "l5", "iso-8859-9" },
174  { "csISOLatin5", "iso-8859-9" },
175 
176  { "ISO_8859-10:1992", "iso-8859-10" },
177  { "iso-ir-157", "iso-8859-10" },
178  { "latin6", "iso-8859-10" }, /* this is not a bug */
179  { "l6", "iso-8859-10" },
180  { "csISOLatin6", "iso-8859-10" },
181 
182  { "csKOI8r", "koi8-r" },
183 
184  { "MS_Kanji", "Shift_JIS" }, /* Note the underscore! */
185  { "csShiftJis", "Shift_JIS" },
186 
187  { "Extended_UNIX_Code_Packed_Format_for_Japanese",
188  "euc-jp" },
189  { "csEUCPkdFmtJapanese", "euc-jp" },
190 
191  { "csGB2312", "gb2312" },
192  { "csbig5", "big5" },
193 
194  /* End of official brain damage.
195  * What follows has been taken from glibc's localedata files. */
196 
197  { "iso_8859-13", "iso-8859-13" },
198  { "iso-ir-179", "iso-8859-13" },
199  { "latin7", "iso-8859-13" }, /* this is not a bug */
200  { "l7", "iso-8859-13" },
201 
202  { "iso_8859-14", "iso-8859-14" },
203  { "latin8", "iso-8859-14" }, /* this is not a bug */
204  { "l8", "iso-8859-14" },
205 
206  { "iso_8859-15", "iso-8859-15" },
207  { "latin9", "iso-8859-15" }, /* this is not a bug */
208 
209  /* Suggested by Ionel Mugurel Ciobica <tgakic@sg10.chem.tue.nl> */
210  { "latin0", "iso-8859-15" }, /* this is not a bug */
211 
212  { "iso_8859-16", "iso-8859-16" },
213  { "latin10", "iso-8859-16" }, /* this is not a bug */
214 
215  { "646", "us-ascii" },
216 
217  /* http://www.sun.com/software/white-papers/wp-unicode/ */
218 
219  { "eucJP", "euc-jp" },
220  { "PCK", "Shift_JIS" },
221  { "ko_KR-euc", "euc-kr" },
222  { "zh_TW-big5", "big5" },
223 
224  /* seems to be common on some systems */
225 
226  { "sjis", "Shift_JIS" },
227  { "euc-jp-ms", "eucJP-ms" },
228 
229  /* If you happen to encounter system-specific brain-damage with respect to
230  * character set naming, please add it above this comment, and submit a patch
231  * to <neomutt-devel@neomutt.org> */
232 
233  { NULL, NULL },
234 };
235 // clang-format on
236 
241 struct Lookup *lookup_new(void)
242 {
243  return mutt_mem_calloc(1, sizeof(struct Lookup));
244 }
245 
250 void lookup_free(struct Lookup **ptr)
251 {
252  if (!ptr || !*ptr)
253  return;
254 
255  struct Lookup *l = *ptr;
256  FREE(&l->replacement);
257  FREE(&l->regex.pattern);
258  if (l->regex.regex)
259  regfree(l->regex.regex);
260  FREE(&l->regex.regex);
261  FREE(&l->regex);
262 
263  FREE(ptr);
264 }
265 
275 static const char *lookup_charset(enum LookupType type, const char *cs)
276 {
277  if (!cs)
278  return NULL;
279 
280  struct Lookup *l = NULL;
281 
282  TAILQ_FOREACH(l, &Lookups, entries)
283  {
284  if (l->type != type)
285  continue;
286  if (mutt_regex_match(&l->regex, cs))
287  return l->replacement;
288  }
289  return NULL;
290 }
291 
302 {
303  if (!ps)
304  return -1;
305 
306  const char *c1 = NULL;
307 
308  for (const char *c = C_AssumedCharset; c; c = c1 ? c1 + 1 : 0)
309  {
310  char *u = *ps;
311  size_t ulen = mutt_str_strlen(*ps);
312 
313  if (!u || !*u)
314  return 0;
315 
316  c1 = strchr(c, ':');
317  size_t n = c1 ? c1 - c : mutt_str_strlen(c);
318  if (n == 0)
319  return 0;
320  char *fromcode = mutt_mem_malloc(n + 1);
321  mutt_str_strfcpy(fromcode, c, n + 1);
322  char *s = mutt_str_substr_dup(u, u + ulen);
323  int m = mutt_ch_convert_string(&s, fromcode, C_Charset, 0);
324  FREE(&fromcode);
325  FREE(&s);
326  if (m == 0)
327  {
328  return 0;
329  }
330  }
333  return -1;
334 }
335 
345 void mutt_ch_canonical_charset(char *buf, size_t buflen, const char *name)
346 {
347  if (!buf || !name)
348  return;
349 
350  char in[1024], scratch[1024];
351 
352  mutt_str_strfcpy(in, name, sizeof(in));
353  char *ext = strchr(in, '/');
354  if (ext)
355  *ext++ = '\0';
356 
357  if ((mutt_str_strcasecmp(in, "utf-8") == 0) ||
358  (mutt_str_strcasecmp(in, "utf8") == 0))
359  {
360  mutt_str_strfcpy(buf, "utf-8", buflen);
361  goto out;
362  }
363 
364  /* catch some common iso-8859-something misspellings */
365  size_t plen;
366  if ((plen = mutt_str_startswith(in, "8859", CASE_IGNORE)) && (in[plen] != '-'))
367  snprintf(scratch, sizeof(scratch), "iso-8859-%s", in + plen);
368  else if ((plen = mutt_str_startswith(in, "8859-", CASE_IGNORE)))
369  snprintf(scratch, sizeof(scratch), "iso-8859-%s", in + plen);
370  else if ((plen = mutt_str_startswith(in, "iso8859", CASE_IGNORE)) && (in[plen] != '-'))
371  snprintf(scratch, sizeof(scratch), "iso_8859-%s", in + plen);
372  else if ((plen = mutt_str_startswith(in, "iso8859-", CASE_IGNORE)))
373  snprintf(scratch, sizeof(scratch), "iso_8859-%s", in + plen);
374  else
375  mutt_str_strfcpy(scratch, in, sizeof(scratch));
376 
377  for (size_t i = 0; PreferredMimeNames[i].key; i++)
378  {
379  if (mutt_str_strcasecmp(scratch, PreferredMimeNames[i].key) == 0)
380  {
381  mutt_str_strfcpy(buf, PreferredMimeNames[i].pref, buflen);
382  goto out;
383  }
384  }
385 
386  mutt_str_strfcpy(buf, scratch, buflen);
387 
388  /* for cosmetics' sake, transform to lowercase. */
389  for (char *p = buf; *p; p++)
390  *p = tolower(*p);
391 
392 out:
393  if (ext && *ext)
394  {
395  mutt_str_strcat(buf, buflen, "/");
396  mutt_str_strcat(buf, buflen, ext);
397  }
398 }
399 
/**
 * mutt_ch_chscmp - Are the names of two character sets equivalent?
 * @param cs1 First character set; canonicalised before comparing
 * @param cs2 Second character set; compared as given
 * @retval true  The shorter name is a case-insensitive prefix of the longer
 * @retval false The names differ, or either argument is NULL
 *
 * Only as many characters as the shorter of the two names are compared.
 */
bool mutt_ch_chscmp(const char *cs1, const char *cs2)
{
  if (!cs1 || !cs2)
    return false;

  char canon[256];
  mutt_ch_canonical_charset(canon, sizeof(canon), cs1);

  int len1 = mutt_str_strlen(canon);
  int len2 = mutt_str_strlen(cs2);

  /* Put the longer name first and limit the comparison to the shorter one */
  const char *longer = cs2;
  const char *shorter = canon;
  int cmplen = len1;
  if (len1 > len2)
  {
    longer = canon;
    shorter = cs2;
    cmplen = len2;
  }

  return mutt_str_strncasecmp(longer, shorter, cmplen) == 0;
}
427 
435 {
436  static char fcharset[128];
437  const char *c = C_AssumedCharset;
438  const char *c1 = NULL;
439 
440  if (c)
441  {
442  c1 = strchr(c, ':');
443  mutt_str_strfcpy(fcharset, c, c1 ? (c1 - c + 1) : sizeof(fcharset));
444  return fcharset;
445  }
446  return strcpy(fcharset, "us-ascii");
447 }
448 
/**
 * mutt_ch_get_langinfo_charset - Get the user's choice of character set
 * @retval ptr Charset string, duplicated with mutt_str_strdup()
 *
 * The name comes from nl_langinfo(CODESET), passed through
 * mutt_ch_canonical_charset().  If that yields an empty string,
 * "iso-8859-1" is returned instead.
 */
char *mutt_ch_get_langinfo_charset(void)
{
  char buf[1024] = { 0 };

  mutt_ch_canonical_charset(buf, sizeof(buf), nl_langinfo(CODESET));

  if (buf[0] != '\0')
    return mutt_str_strdup(buf);

  return mutt_str_strdup("iso-8859-1");
}
467 
479 bool mutt_ch_lookup_add(enum LookupType type, const char *pat,
480  const char *replace, struct Buffer *err)
481 {
482  if (!pat || !replace)
483  return false;
484 
485  regex_t *rx = mutt_mem_malloc(sizeof(regex_t));
486  int rc = REG_COMP(rx, pat, REG_ICASE);
487  if (rc != 0)
488  {
489  regerror(rc, rx, err->data, err->dsize);
490  FREE(&rx);
491  return false;
492  }
493 
494  struct Lookup *l = lookup_new();
495  l->type = type;
496  l->replacement = mutt_str_strdup(replace);
497  l->regex.pattern = mutt_str_strdup(pat);
498  l->regex.regex = rx;
499  l->regex.pat_not = false;
500 
501  TAILQ_INSERT_TAIL(&Lookups, l, entries);
502 
503  return true;
504 }
505 
512 {
513  struct Lookup *l = NULL;
514  struct Lookup *tmp = NULL;
515 
516  TAILQ_FOREACH_SAFE(l, &Lookups, entries, tmp)
517  {
518  TAILQ_REMOVE(&Lookups, l, entries);
519  lookup_free(&l);
520  }
521 }
522 
532 const char *mutt_ch_charset_lookup(const char *chs)
533 {
534  return lookup_charset(MUTT_LOOKUP_CHARSET, chs);
535 }
536 
559 iconv_t mutt_ch_iconv_open(const char *tocode, const char *fromcode, int flags)
560 {
561  char tocode1[128];
562  char fromcode1[128];
563  const char *tocode2 = NULL, *fromcode2 = NULL;
564  const char *tmp = NULL;
565 
566  iconv_t cd;
567 
568  /* transform to MIME preferred charset names */
569  mutt_ch_canonical_charset(tocode1, sizeof(tocode1), tocode);
570  mutt_ch_canonical_charset(fromcode1, sizeof(fromcode1), fromcode);
571 
572  /* maybe apply charset-hooks and recanonicalise fromcode,
573  * but only when caller asked us to sanitize a potentially wrong
574  * charset name incoming from the wild exterior. */
575  if (flags & MUTT_ICONV_HOOK_FROM)
576  {
577  tmp = mutt_ch_charset_lookup(fromcode1);
578  if (tmp)
579  mutt_ch_canonical_charset(fromcode1, sizeof(fromcode1), tmp);
580  }
581 
582  /* always apply iconv-hooks to suit system's iconv tastes */
583  tocode2 = mutt_ch_iconv_lookup(tocode1);
584  tocode2 = tocode2 ? tocode2 : tocode1;
585  fromcode2 = mutt_ch_iconv_lookup(fromcode1);
586  fromcode2 = fromcode2 ? fromcode2 : fromcode1;
587 
588  /* call system iconv with names it appreciates */
589  cd = iconv_open(tocode2, fromcode2);
590  if (cd != (iconv_t) -1)
591  return cd;
592 
593  return (iconv_t) -1;
594 }
595 
612 size_t mutt_ch_iconv(iconv_t cd, const char **inbuf, size_t *inbytesleft,
613  char **outbuf, size_t *outbytesleft, const char **inrepls,
614  const char *outrepl, int *iconverrno)
615 {
616  size_t rc = 0;
617  const char *ib = *inbuf;
618  size_t ibl = *inbytesleft;
619  char *ob = *outbuf;
620  size_t obl = *outbytesleft;
621 
622  while (true)
623  {
624  errno = 0;
625  const size_t ret1 = iconv(cd, (ICONV_CONST char **) &ib, &ibl, &ob, &obl);
626  if (ret1 != (size_t) -1)
627  rc += ret1;
628  if (iconverrno)
629  *iconverrno = errno;
630 
631  if (ibl && obl && (errno == EILSEQ))
632  {
633  if (inrepls)
634  {
635  /* Try replacing the input */
636  const char **t = NULL;
637  for (t = inrepls; *t; t++)
638  {
639  const char *ib1 = *t;
640  size_t ibl1 = strlen(*t);
641  char *ob1 = ob;
642  size_t obl1 = obl;
643  iconv(cd, (ICONV_CONST char **) &ib1, &ibl1, &ob1, &obl1);
644  if (ibl1 == 0)
645  {
646  ib++;
647  ibl--;
648  ob = ob1;
649  obl = obl1;
650  rc++;
651  break;
652  }
653  }
654  if (*t)
655  continue;
656  }
657  /* Replace the output */
658  if (!outrepl)
659  outrepl = "?";
660  iconv(cd, NULL, NULL, &ob, &obl);
661  if (obl)
662  {
663  int n = strlen(outrepl);
664  if (n > obl)
665  {
666  outrepl = "?";
667  n = 1;
668  }
669  memcpy(ob, outrepl, n);
670  ib++;
671  ibl--;
672  ob += n;
673  obl -= n;
674  rc++;
675  iconv(cd, NULL, NULL, NULL, NULL); /* for good measure */
676  continue;
677  }
678  }
679  *inbuf = ib;
680  *inbytesleft = ibl;
681  *outbuf = ob;
682  *outbytesleft = obl;
683  return rc;
684  }
685 }
686 
696 const char *mutt_ch_iconv_lookup(const char *chs)
697 {
698  return lookup_charset(MUTT_LOOKUP_ICONV, chs);
699 }
700 
711 int mutt_ch_check(const char *s, size_t slen, const char *from, const char *to)
712 {
713  if (!s || !from || !to)
714  return -1;
715 
716  int rc = 0;
717  iconv_t cd = mutt_ch_iconv_open(to, from, 0);
718  if (cd == (iconv_t) -1)
719  return -1;
720 
721  size_t outlen = MB_LEN_MAX * slen;
722  char *out = mutt_mem_malloc(outlen + 1);
723  char *saved_out = out;
724 
725  const size_t convlen =
726  iconv(cd, (ICONV_CONST char **) &s, &slen, &out, (size_t *) &outlen);
727  if (convlen == -1)
728  rc = errno;
729 
730  FREE(&saved_out);
731  iconv_close(cd);
732  return rc;
733 }
734 
/**
 * mutt_ch_convert_string - Convert a string between encodings
 * @param[in,out] ps    String to convert
 * @param[in]     from  Current character set
 * @param[in]     to    Target character set
 * @param[in]     flags Flags, passed on to mutt_ch_iconv_open()
 * @retval  0  Success, or the string is NULL/empty
 * @retval -1  `ps`, `from` or `to` is NULL, or the conversion couldn't be set up
 * @retval num errno reported by mutt_ch_iconv() for its last iconv() call
 *
 * On any return after the conversion has been set up, `*ps` has been freed
 * and replaced by a newly allocated, converted string.
 *
 * Unconvertible input is substituted: U+FFFD when converting to utf-8;
 * U+FFFD-then-"?" as input replacements when converting from utf-8;
 * otherwise a plain "?".
 */
int mutt_ch_convert_string(char **ps, const char *from, const char *to, int flags)
{
  if (!ps)
    return -1;

  char *src = *ps;
  if (!src || (src[0] == '\0'))
    return 0;

  if (!to || !from)
    return -1;

  iconv_t cd = mutt_ch_iconv_open(to, from, flags);
  if (cd == (iconv_t) -1)
    return -1;

  /* "\357\277\275" is U+FFFD encoded as utf-8 */
  const char *repls[] = { "\357\277\275", "?", 0 };
  const char **inrepls = NULL;
  const char *outrepl = NULL;

  if (mutt_ch_is_utf8(to))
    outrepl = "\357\277\275";
  else if (mutt_ch_is_utf8(from))
    inrepls = repls;
  else
    outrepl = "?";

  /* Convert the terminator too, and leave room for the worst-case expansion */
  const char *ib = src;
  size_t ibl = strlen(src) + 1;
  size_t obl = MB_LEN_MAX * ibl;
  char *buf = mutt_mem_malloc(obl + 1);
  char *ob = buf;

  int rc = 0;
  mutt_ch_iconv(cd, &ib, &ibl, &ob, &obl, inrepls, outrepl, &rc);
  iconv_close(cd);

  *ob = '\0';

  FREE(ps);
  *ps = buf;

  mutt_str_adjust(ps);
  return rc;
}
800 
812 bool mutt_ch_check_charset(const char *cs, bool strict)
813 {
814  if (!cs)
815  return false;
816 
817  if (mutt_ch_is_utf8(cs))
818  return true;
819 
820  if (!strict)
821  {
822  for (int i = 0; PreferredMimeNames[i].key; i++)
823  {
824  if ((mutt_str_strcasecmp(PreferredMimeNames[i].key, cs) == 0) ||
825  (mutt_str_strcasecmp(PreferredMimeNames[i].pref, cs) == 0))
826  {
827  return true;
828  }
829  }
830  }
831 
832  iconv_t cd = mutt_ch_iconv_open(cs, cs, 0);
833  if (cd != (iconv_t)(-1))
834  {
835  iconv_close(cd);
836  return true;
837  }
838 
839  return false;
840 }
841 
852 struct FgetConv *mutt_ch_fgetconv_open(FILE *fp, const char *from, const char *to, int flags)
853 {
854  struct FgetConv *fc = NULL;
855  iconv_t cd = (iconv_t) -1;
856 
857  if (from && to)
858  cd = mutt_ch_iconv_open(to, from, flags);
859 
860  if (cd != (iconv_t) -1)
861  {
862  static const char *repls[] = { "\357\277\275", "?", 0 };
863 
864  fc = mutt_mem_malloc(sizeof(struct FgetConv));
865  fc->p = fc->bufo;
866  fc->ob = fc->bufo;
867  fc->ib = fc->bufi;
868  fc->ibl = 0;
869  fc->inrepls = mutt_ch_is_utf8(to) ? repls : repls + 1;
870  }
871  else
872  fc = mutt_mem_malloc(sizeof(struct FgetConvNot));
873  fc->fp = fp;
874  fc->cd = cd;
875  return fc;
876 }
877 
883 {
884  if (!fc || !*fc)
885  return;
886 
887  if ((*fc)->cd != (iconv_t) -1)
888  iconv_close((*fc)->cd);
889  FREE(fc);
890 }
891 
902 int mutt_ch_fgetconv(struct FgetConv *fc)
903 {
904  if (!fc)
905  return EOF;
906  if (fc->cd == (iconv_t) -1)
907  return fgetc(fc->fp);
908  if (!fc->p)
909  return EOF;
910  if (fc->p < fc->ob)
911  return (unsigned char) *(fc->p)++;
912 
913  /* Try to convert some more */
914  fc->p = fc->bufo;
915  fc->ob = fc->bufo;
916  if (fc->ibl)
917  {
918  size_t obl = sizeof(fc->bufo);
919  iconv(fc->cd, (ICONV_CONST char **) &fc->ib, &fc->ibl, &fc->ob, &obl);
920  if (fc->p < fc->ob)
921  return (unsigned char) *(fc->p)++;
922  }
923 
924  /* If we trusted iconv a bit more, we would at this point
925  * ask why it had stopped converting ... */
926 
927  /* Try to read some more */
928  if ((fc->ibl == sizeof(fc->bufi)) ||
929  (fc->ibl && (fc->ib + fc->ibl < fc->bufi + sizeof(fc->bufi))))
930  {
931  fc->p = 0;
932  return EOF;
933  }
934  if (fc->ibl)
935  memcpy(fc->bufi, fc->ib, fc->ibl);
936  fc->ib = fc->bufi;
937  fc->ibl += fread(fc->ib + fc->ibl, 1, sizeof(fc->bufi) - fc->ibl, fc->fp);
938 
939  /* Try harder this time to convert some */
940  if (fc->ibl)
941  {
942  size_t obl = sizeof(fc->bufo);
943  mutt_ch_iconv(fc->cd, (const char **) &fc->ib, &fc->ibl, &fc->ob, &obl,
944  fc->inrepls, 0, NULL);
945  if (fc->p < fc->ob)
946  return (unsigned char) *(fc->p)++;
947  }
948 
949  /* Either the file has finished or one of the buffers is too small */
950  fc->p = 0;
951  return EOF;
952 }
953 
/**
 * mutt_ch_fgetconvs - Convert a file's charset into a string buffer
 * @param buf    Buffer for result
 * @param buflen Length of buffer
 * @param fc     FgetConv handle
 * @retval ptr  `buf`, holding at least one converted character
 * @retval NULL Nothing could be read, `buf` is NULL, or `buflen` is 0
 *
 * Read a file into a buffer, converting the character set as it goes.
 * Reading stops after a newline (which is stored), at EOF, or when only the
 * space for the terminating NUL is left.
 */
char *mutt_ch_fgetconvs(char *buf, size_t buflen, struct FgetConv *fc)
{
  /* With no room at all, not even the terminator may be written */
  if (!buf || (buflen == 0))
    return NULL;

  size_t r;
  for (r = 0; (r + 1) < buflen;)
  {
    const int c = mutt_ch_fgetconv(fc);
    if (c == EOF)
      break;
    buf[r++] = (char) c;
    if (c == '\n')
      break;
  }
  buf[r] = '\0';

  if (r > 0)
    return buf;

  return NULL;
}
986 
997 void mutt_ch_set_charset(const char *charset)
998 {
999  char buf[256];
1000 
1001  mutt_ch_canonical_charset(buf, sizeof(buf), charset);
1002 
1003  if (mutt_ch_is_utf8(buf))
1004  {
1005  CharsetIsUtf8 = true;
1006  ReplacementChar = 0xfffd; /* replacement character */
1007  }
1008  else
1009  {
1010  CharsetIsUtf8 = false;
1011  ReplacementChar = '?';
1012  }
1013 
1014 #if defined(HAVE_BIND_TEXTDOMAIN_CODESET) && defined(ENABLE_NLS)
1015  bind_textdomain_codeset(PACKAGE, buf);
1016 #endif
1017 }
1018 
/**
 * mutt_ch_choose - Figure the best charset to encode a string
 * @param[in]  fromcode Original charset of the string
 * @param[in]  charsets Colon-separated list of potential charsets to use
 * @param[in]  u        String to encode
 * @param[in]  ulen     Length of the string to encode
 * @param[out] d        If not NULL, receives the converted string
 * @param[out] dlen     If not NULL, receives the length of the chosen result
 * @retval ptr  Canonical name of the chosen charset (caller frees)
 * @retval NULL `fromcode` is NULL, or no charset in the list worked
 *
 * Empty list entries are skipped.  With `d` set, each candidate is tried by
 * really converting a copy of the string; otherwise mutt_ch_check() is used.
 * The first candidate that succeeds is chosen, and is displaced only by a
 * later successful candidate whose *name* is strictly shorter.
 *
 * NOTE(review): ranking candidates by the length of the charset name looks
 * unusual -- confirm that this is the intended measure of "best".
 */
char *mutt_ch_choose(const char *fromcode, const char *charsets, const char *u,
                     size_t ulen, char **d, size_t *dlen)
{
  if (!fromcode)
    return NULL;

  char *best_text = NULL;
  char *tocode = NULL;
  size_t best_textlen = 0;
  size_t bestn = 0;
  const char *q = NULL;

  for (const char *p = charsets; p; p = q ? q + 1 : 0)
  {
    q = strchr(p, ':');

    const size_t n = q ? q - p : strlen(p);
    if (n == 0)
      continue;

    /* Private, NUL-terminated copy of this candidate's name */
    char *t = mutt_mem_malloc(n + 1);
    memcpy(t, p, n);
    t[n] = '\0';

    char *s = mutt_str_substr_dup(u, u + ulen);
    int rc;
    if (d)
      rc = mutt_ch_convert_string(&s, fromcode, t, 0);
    else
      rc = mutt_ch_check(s, ulen, fromcode, t);

    const bool better = (rc == 0) && (!tocode || (n < bestn));
    if (!better)
    {
      FREE(&t);
      FREE(&s);
      continue;
    }

    const size_t slen = mutt_str_strlen(s);

    /* New favourite: take ownership of the name (and of the text, if wanted) */
    bestn = n;
    FREE(&tocode);
    tocode = t;
    if (d)
    {
      FREE(&best_text);
      best_text = s;
    }
    else
    {
      FREE(&s);
    }
    best_textlen = slen;
  }

  if (!tocode)
    return NULL;

  if (d)
    *d = best_text;
  if (dlen)
    *dlen = best_textlen;

  char canonical_buf[1024];
  mutt_ch_canonical_charset(canonical_buf, sizeof(canonical_buf), tocode);
  mutt_str_replace(&tocode, canonical_buf);

  return tocode;
}
char * C_AssumedCharset
Config: If a message is missing a character set, assume this character set.
Definition: charset.c:53
#define mutt_ch_is_utf8(str)
Definition: charset.h:106
Character set conversion.
Definition: charset.h:78
void * mutt_mem_calloc(size_t nmemb, size_t size)
Allocate zeroed memory on the heap.
Definition: memory.c:50
int mutt_ch_convert_string(char **ps, const char *from, const char *to, int flags)
Convert a string between encodings.
Definition: charset.c:748
#define MIN(a, b)
Definition: memory.h:31
Memory management wrappers.
char * mutt_ch_choose(const char *fromcode, const char *charsets, const char *u, size_t ulen, char **d, size_t *dlen)
Figure the best charset to encode a string.
Definition: charset.c:1030
#define TAILQ_FOREACH(var, head, field)
Definition: queue.h:719
int mutt_ch_fgetconv(struct FgetConv *fc)
Convert a file's character set.
Definition: charset.c:902
const char * key
Definition: charset.h:68
char bufi[512]
Definition: charset.h:45
static const char * lookup_charset(enum LookupType type, const char *cs)
Look for a preferred character set name.
Definition: charset.c:275
static size_t plen
Length of cached packet.
Definition: pgppacket.c:38
regex_t * regex
compiled expression
Definition: regex3.h:60
Conversion between different character encodings.
String manipulation buffer.
Definition: buffer.h:33
bool pat_not
do not match
Definition: regex3.h:61
#define TAILQ_FOREACH_SAFE(var, head, field, tvar)
Definition: queue.h:729
char * replacement
Alternative charset to use.
Definition: charset.c:75
bool mutt_ch_check_charset(const char *cs, bool strict)
Does iconv understand a character set?
Definition: charset.c:812
struct Regex regex
Regular expression.
Definition: charset.c:74
void mutt_str_adjust(char **p)
Shrink-to-fit a string.
Definition: string.c:495
size_t mutt_str_strlen(const char *a)
Calculate the length of a string, safely.
Definition: string.c:666
void mutt_ch_set_charset(const char *charset)
Update the records for a new character set.
Definition: charset.c:997
bool mutt_ch_lookup_add(enum LookupType type, const char *pat, const char *replace, struct Buffer *err)
Add a new character set lookup.
Definition: charset.c:479
String manipulation functions.
wchar_t ReplacementChar
When a Unicode character can't be displayed, use this instead.
Definition: charset.c:59
Regex to String lookup table.
Definition: charset.c:71
size_t dsize
Length of data.
Definition: buffer.h:37
bool mutt_ch_chscmp(const char *cs1, const char *cs2)
Are the names of two character sets equivalent?
Definition: charset.c:412
void lookup_free(struct Lookup **ptr)
Free a Lookup.
Definition: charset.c:250
size_t ibl
Definition: charset.h:50
const char * mutt_ch_iconv_lookup(const char *chs)
Look for a replacement character set.
Definition: charset.c:696
const char * name
Definition: pgpmicalg.c:45
LookupType
Types of character set lookups.
Definition: charset.h:75
#define REG_COMP(preg, regex, cflags)
Compile a regular expression.
Definition: regex3.h:52
void mutt_ch_canonical_charset(char *buf, size_t buflen, const char *name)
Canonicalise the charset of a string.
Definition: charset.c:345
static char * chs
Definition: gnupgparse.c:72
#define TAILQ_REMOVE(head, elm, field)
Definition: queue.h:821
size_t mutt_ch_iconv(iconv_t cd, const char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft, const char **inrepls, const char *outrepl, int *iconverrno)
Change the encoding of a string.
Definition: charset.c:612
int mutt_str_strncasecmp(const char *a, const char *b, size_t l)
Compare two strings ignoring case (to a maximum), safely.
Definition: string.c:656
int mutt_ch_check(const char *s, size_t slen, const char *from, const char *to)
Check whether a string can be converted between encodings.
Definition: charset.c:711
A dummy converter.
Definition: charset.h:57
FILE * fp
Definition: charset.h:43
iconv_t cd
Definition: charset.h:44
void * mutt_mem_malloc(size_t size)
Allocate memory on the heap.
Definition: memory.c:90
const char * mutt_ch_charset_lookup(const char *chs)
Look for a replacement character set.
Definition: charset.c:532
char * data
Pointer to data.
Definition: buffer.h:35
static struct LookupList Lookups
Definition: charset.c:80
size_t mutt_str_strfcpy(char *dest, const char *src, size_t dsize)
Copy a string into a buffer (guaranteeing NUL-termination)
Definition: string.c:750
iconv_t mutt_ch_iconv_open(const char *tocode, const char *fromcode, int flags)
Set up iconv for conversions.
Definition: charset.c:559
#define TAILQ_INSERT_TAIL(head, elm, field)
Definition: queue.h:803
Ignore case when comparing strings.
Definition: string2.h:68
char * ib
Definition: charset.h:49
char * mutt_ch_get_langinfo_charset(void)
Get the user's choice of character set.
Definition: charset.c:456
char * p
Definition: charset.h:47
void mutt_ch_lookup_remove(void)
Remove all the character set lookups.
Definition: charset.c:511
MIME name lookup entry.
Definition: charset.h:66
char bufo[512]
Definition: charset.h:46
void mutt_str_replace(char **p, const char *s)
Replace one string with another.
Definition: string.c:453
enum LookupType type
Lookup type.
Definition: charset.c:73
struct Lookup * lookup_new(void)
Create a new Lookup.
Definition: charset.c:241
size_t mutt_str_startswith(const char *str, const char *prefix, enum CaseSensitivity cs)
Check whether a string starts with a prefix.
Definition: string.c:168
#define TAILQ_ENTRY(type)
Definition: queue.h:634
char * mutt_str_strcat(char *buf, size_t buflen, const char *s)
Concatenate two strings.
Definition: string.c:395
void mutt_ch_fgetconv_close(struct FgetConv **fc)
Close an fgetconv handle.
Definition: charset.c:882
General purpose object for storing and parsing strings.
bool CharsetIsUtf8
Is the user's current character set utf-8?
Definition: charset.c:64
char * mutt_str_strdup(const char *str)
Copy a string, safely.
Definition: string.c:380
int mutt_ch_convert_nonmime_string(char **ps)
Try to convert a string using a list of character sets.
Definition: charset.c:301
Cached regular expression.
Definition: regex3.h:57
int mutt_str_strcasecmp(const char *a, const char *b)
Compare two strings ignoring case, safely.
Definition: string.c:628
bool mutt_regex_match(const struct Regex *regex, const char *str)
Shorthand to mutt_regex_capture()
Definition: regex.c:610
#define FREE(x)
Definition: memory.h:40
char * mutt_ch_get_default_charset(void)
Get the default character set.
Definition: charset.c:434
char * mutt_ch_fgetconvs(char *buf, size_t buflen, struct FgetConv *fc)
Convert a file's charset into a string buffer.
Definition: charset.c:964
#define EILSEQ
Definition: charset.c:50
Manage regular expressions.
char * C_Charset
Config: Default character set for displaying text on screen.
Definition: charset.c:54
char * pattern
printable version
Definition: regex3.h:59
#define MUTT_ICONV_HOOK_FROM
apply charset-hooks to fromcode
Definition: charset.h:81
struct FgetConv * mutt_ch_fgetconv_open(FILE *fp, const char *from, const char *to, int flags)
Prepare a file for charset conversion.
Definition: charset.c:852
#define TAILQ_HEAD_INITIALIZER(head)
Definition: queue.h:631
char * mutt_str_substr_dup(const char *begin, const char *end)
Duplicate a sub-string.
Definition: string.c:579
TAILQ_HEAD(LookupList, Lookup)
Cursor for converting a file's encoding.
Definition: charset.h:41
Alias for another character set.
Definition: charset.h:77
char * ob
Definition: charset.h:48
const char ** inrepls
Definition: charset.h:51