NeoMutt  2021-02-05-329-g9e03b7
Teaching an old dog new tricks
DOXYGEN
charset.c
Go to the documentation of this file.
1 
29 #include "config.h"
30 #include <ctype.h>
31 #include <errno.h>
32 #include <iconv.h>
33 #include <langinfo.h>
34 #include <limits.h>
35 #include <stdbool.h>
36 #include <stdio.h>
37 #include <string.h>
38 #include "config/lib.h"
39 #include "core/lib.h"
40 #include "charset.h"
41 #include "buffer.h"
42 #include "memory.h"
43 #include "queue.h"
44 #include "regex3.h"
45 #include "string2.h"
46 #ifdef ENABLE_NLS
47 #include <libintl.h>
48 #endif
49 
50 #ifndef EILSEQ
51 #define EILSEQ EINVAL
52 #endif
53 
/**
 * ReplacementChar - Character used when a Unicode character can't be displayed
 *
 * Updated by mutt_ch_set_charset() whenever the character set changes.
 */
wchar_t ReplacementChar = L'?';

/**
 * CharsetIsUtf8 - Is the user's current character set utf-8?
 *
 * Updated by mutt_ch_set_charset() whenever the character set changes.
 */
bool CharsetIsUtf8 = false;
63 
69 struct Lookup
70 {
72  struct Regex regex;
73  char *replacement;
74  TAILQ_ENTRY(Lookup) entries;
75 };
76 TAILQ_HEAD(LookupList, Lookup);
77 
78 static struct LookupList Lookups = TAILQ_HEAD_INITIALIZER(Lookups);
79 
/**
 * struct MimeNames - MIME name lookup entry
 *
 * Pairs one spelling of a character set name with the name to use instead.
 */
struct MimeNames
{
  const char *key;  ///< Charset name to be matched
  const char *pref; ///< Preferred name to use for `key`
};
88 
89 // clang-format off
100 const struct MimeNames PreferredMimeNames[] =
101 {
102  { "ansi_x3.4-1968", "us-ascii" },
103  { "iso-ir-6", "us-ascii" },
104  { "iso_646.irv:1991", "us-ascii" },
105  { "ascii", "us-ascii" },
106  { "iso646-us", "us-ascii" },
107  { "us", "us-ascii" },
108  { "ibm367", "us-ascii" },
109  { "cp367", "us-ascii" },
110  { "csASCII", "us-ascii" },
111 
112  { "csISO2022KR", "iso-2022-kr" },
113  { "csEUCKR", "euc-kr" },
114  { "csISO2022JP", "iso-2022-jp" },
115  { "csISO2022JP2", "iso-2022-jp-2" },
116 
117  { "ISO_8859-1:1987", "iso-8859-1" },
118  { "iso-ir-100", "iso-8859-1" },
119  { "iso_8859-1", "iso-8859-1" },
120  { "latin1", "iso-8859-1" },
121  { "l1", "iso-8859-1" },
122  { "IBM819", "iso-8859-1" },
123  { "CP819", "iso-8859-1" },
124  { "csISOLatin1", "iso-8859-1" },
125 
126  { "ISO_8859-2:1987", "iso-8859-2" },
127  { "iso-ir-101", "iso-8859-2" },
128  { "iso_8859-2", "iso-8859-2" },
129  { "latin2", "iso-8859-2" },
130  { "l2", "iso-8859-2" },
131  { "csISOLatin2", "iso-8859-2" },
132 
133  { "ISO_8859-3:1988", "iso-8859-3" },
134  { "iso-ir-109", "iso-8859-3" },
135  { "ISO_8859-3", "iso-8859-3" },
136  { "latin3", "iso-8859-3" },
137  { "l3", "iso-8859-3" },
138  { "csISOLatin3", "iso-8859-3" },
139 
140  { "ISO_8859-4:1988", "iso-8859-4" },
141  { "iso-ir-110", "iso-8859-4" },
142  { "ISO_8859-4", "iso-8859-4" },
143  { "latin4", "iso-8859-4" },
144  { "l4", "iso-8859-4" },
145  { "csISOLatin4", "iso-8859-4" },
146 
147  { "ISO_8859-6:1987", "iso-8859-6" },
148  { "iso-ir-127", "iso-8859-6" },
149  { "iso_8859-6", "iso-8859-6" },
150  { "ECMA-114", "iso-8859-6" },
151  { "ASMO-708", "iso-8859-6" },
152  { "arabic", "iso-8859-6" },
153  { "csISOLatinArabic", "iso-8859-6" },
154 
155  { "ISO_8859-7:1987", "iso-8859-7" },
156  { "iso-ir-126", "iso-8859-7" },
157  { "ISO_8859-7", "iso-8859-7" },
158  { "ELOT_928", "iso-8859-7" },
159  { "ECMA-118", "iso-8859-7" },
160  { "greek", "iso-8859-7" },
161  { "greek8", "iso-8859-7" },
162  { "csISOLatinGreek", "iso-8859-7" },
163 
164  { "ISO_8859-8:1988", "iso-8859-8" },
165  { "iso-ir-138", "iso-8859-8" },
166  { "ISO_8859-8", "iso-8859-8" },
167  { "hebrew", "iso-8859-8" },
168  { "csISOLatinHebrew", "iso-8859-8" },
169 
170  { "ISO_8859-5:1988", "iso-8859-5" },
171  { "iso-ir-144", "iso-8859-5" },
172  { "ISO_8859-5", "iso-8859-5" },
173  { "cyrillic", "iso-8859-5" },
174  { "csISOLatinCyrillic", "iso-8859-5" },
175 
176  { "ISO_8859-9:1989", "iso-8859-9" },
177  { "iso-ir-148", "iso-8859-9" },
178  { "ISO_8859-9", "iso-8859-9" },
179  { "latin5", "iso-8859-9" }, /* this is not a bug */
180  { "l5", "iso-8859-9" },
181  { "csISOLatin5", "iso-8859-9" },
182 
183  { "ISO_8859-10:1992", "iso-8859-10" },
184  { "iso-ir-157", "iso-8859-10" },
185  { "latin6", "iso-8859-10" }, /* this is not a bug */
186  { "l6", "iso-8859-10" },
187  { "csISOLatin6", "iso-8859-10" },
188 
189  { "csKOI8r", "koi8-r" },
190 
191  { "MS_Kanji", "Shift_JIS" }, /* Note the underscore! */
192  { "csShiftJis", "Shift_JIS" },
193 
194  { "Extended_UNIX_Code_Packed_Format_for_Japanese",
195  "euc-jp" },
196  { "csEUCPkdFmtJapanese", "euc-jp" },
197 
198  { "csGB2312", "gb2312" },
199  { "csbig5", "big5" },
200 
201  /* End of official brain damage.
202  * What follows has been taken from glibc's localedata files. */
203 
204  { "iso_8859-13", "iso-8859-13" },
205  { "iso-ir-179", "iso-8859-13" },
206  { "latin7", "iso-8859-13" }, /* this is not a bug */
207  { "l7", "iso-8859-13" },
208 
209  { "iso_8859-14", "iso-8859-14" },
210  { "latin8", "iso-8859-14" }, /* this is not a bug */
211  { "l8", "iso-8859-14" },
212 
213  { "iso_8859-15", "iso-8859-15" },
214  { "latin9", "iso-8859-15" }, /* this is not a bug */
215 
216  /* Suggested by Ionel Mugurel Ciobica <tgakic@sg10.chem.tue.nl> */
217  { "latin0", "iso-8859-15" }, /* this is not a bug */
218 
219  { "iso_8859-16", "iso-8859-16" },
220  { "latin10", "iso-8859-16" }, /* this is not a bug */
221 
222  { "646", "us-ascii" },
223 
224  /* http://www.sun.com/software/white-papers/wp-unicode/ */
225 
226  { "eucJP", "euc-jp" },
227  { "PCK", "Shift_JIS" },
228  { "ko_KR-euc", "euc-kr" },
229  { "zh_TW-big5", "big5" },
230 
231  /* seems to be common on some systems */
232 
233  { "sjis", "Shift_JIS" },
234  { "euc-jp-ms", "eucJP-ms" },
235 
236  /* If you happen to encounter system-specific brain-damage with respect to
237  * character set naming, please add it above this comment, and submit a patch
238  * to <neomutt-devel@neomutt.org> */
239 
240  { NULL, NULL },
241 };
242 // clang-format on
243 
248 static struct Lookup *lookup_new(void)
249 {
250  return mutt_mem_calloc(1, sizeof(struct Lookup));
251 }
252 
257 static void lookup_free(struct Lookup **ptr)
258 {
259  if (!ptr || !*ptr)
260  return;
261 
262  struct Lookup *l = *ptr;
263  FREE(&l->replacement);
264  FREE(&l->regex.pattern);
265  if (l->regex.regex)
266  regfree(l->regex.regex);
267  FREE(&l->regex.regex);
268  FREE(&l->regex);
269 
270  FREE(ptr);
271 }
272 
282 static const char *lookup_charset(enum LookupType type, const char *cs)
283 {
284  if (!cs)
285  return NULL;
286 
287  struct Lookup *l = NULL;
288 
289  TAILQ_FOREACH(l, &Lookups, entries)
290  {
291  if (l->type != type)
292  continue;
293  if (mutt_regex_match(&l->regex, cs))
294  return l->replacement;
295  }
296  return NULL;
297 }
298 
309 {
310  if (!ps)
311  return -1;
312 
313  char *u = *ps;
314  const size_t ulen = mutt_str_len(u);
315  if (ulen == 0)
316  return 0;
317 
318  const char *c1 = NULL;
319 
320  const char *const c_assumed_charset =
321  cs_subset_string(NeoMutt->sub, "assumed_charset");
322  const char *const c_charset = cs_subset_string(NeoMutt->sub, "charset");
323  for (const char *c = c_assumed_charset; c; c = c1 ? c1 + 1 : 0)
324  {
325  c1 = strchr(c, ':');
326  size_t n = c1 ? c1 - c : mutt_str_len(c);
327  if (n == 0)
328  return 0;
329  char *fromcode = mutt_mem_malloc(n + 1);
330  mutt_str_copy(fromcode, c, n + 1);
331  char *s = mutt_strn_dup(u, ulen);
332  int m = mutt_ch_convert_string(&s, fromcode, c_charset, MUTT_ICONV_NO_FLAGS);
333  FREE(&fromcode);
334  FREE(&s);
335  if (m == 0)
336  {
337  return 0;
338  }
339  }
341  c_charset, MUTT_ICONV_HOOK_FROM);
342  return -1;
343 }
344 
354 void mutt_ch_canonical_charset(char *buf, size_t buflen, const char *name)
355 {
356  if (!buf || !name)
357  return;
358 
359  char in[1024], scratch[1024];
360 
361  mutt_str_copy(in, name, sizeof(in));
362  char *ext = strchr(in, '/');
363  if (ext)
364  *ext++ = '\0';
365 
366  if (mutt_istr_equal(in, "utf-8") || mutt_istr_equal(in, "utf8"))
367  {
368  mutt_str_copy(buf, "utf-8", buflen);
369  goto out;
370  }
371 
372  /* catch some common iso-8859-something misspellings */
373  size_t plen;
374  if ((plen = mutt_istr_startswith(in, "8859")) && (in[plen] != '-'))
375  snprintf(scratch, sizeof(scratch), "iso-8859-%s", in + plen);
376  else if ((plen = mutt_istr_startswith(in, "8859-")))
377  snprintf(scratch, sizeof(scratch), "iso-8859-%s", in + plen);
378  else if ((plen = mutt_istr_startswith(in, "iso8859")) && (in[plen] != '-'))
379  snprintf(scratch, sizeof(scratch), "iso_8859-%s", in + plen);
380  else if ((plen = mutt_istr_startswith(in, "iso8859-")))
381  snprintf(scratch, sizeof(scratch), "iso_8859-%s", in + plen);
382  else
383  mutt_str_copy(scratch, in, sizeof(scratch));
384 
385  for (size_t i = 0; PreferredMimeNames[i].key; i++)
386  {
387  if (mutt_istr_equal(scratch, PreferredMimeNames[i].key))
388  {
389  mutt_str_copy(buf, PreferredMimeNames[i].pref, buflen);
390  goto out;
391  }
392  }
393 
394  mutt_str_copy(buf, scratch, buflen);
395 
396  /* for cosmetics' sake, transform to lowercase. */
397  for (char *p = buf; *p; p++)
398  *p = tolower(*p);
399 
400 out:
401  if (ext && *ext)
402  {
403  mutt_str_cat(buf, buflen, "/");
404  mutt_str_cat(buf, buflen, ext);
405  }
406 }
407 
/**
 * mutt_ch_chscmp - Are the names of two character sets equivalent?
 * @param cs1 First character set (canonicalised before comparing)
 * @param cs2 Second character set (compared as given)
 * @retval true  The shorter name is a case-insensitive prefix of the longer
 * @retval false The names differ, or either is NULL
 *
 * Only as many characters as the shorter of the two names has are compared.
 */
bool mutt_ch_chscmp(const char *cs1, const char *cs2)
{
  if (!cs1 || !cs2)
    return false;

  char canon[256];
  mutt_ch_canonical_charset(canon, sizeof(canon), cs1);

  const int canon_len = mutt_str_len(canon);
  const int cs2_len = mutt_str_len(cs2);

  if (canon_len > cs2_len)
    return mutt_istrn_equal(canon, cs2, cs2_len);

  return mutt_istrn_equal(cs2, canon, canon_len);
}
435 
443 {
444  static char fcharset[128];
445  const char *const c_assumed_charset =
446  cs_subset_string(NeoMutt->sub, "assumed_charset");
447  const char *c = c_assumed_charset;
448  const char *c1 = NULL;
449 
450  if (c)
451  {
452  c1 = strchr(c, ':');
453  mutt_str_copy(fcharset, c, c1 ? (c1 - c + 1) : sizeof(fcharset));
454  return fcharset;
455  }
456  return strcpy(fcharset, "us-ascii");
457 }
458 
/**
 * mutt_ch_get_langinfo_charset - Get the user's choice of character set
 * @retval ptr Canonical name of the locale's codeset, or "iso-8859-1" if the
 *             canonical name comes out empty
 *
 * The codeset is taken from nl_langinfo(CODESET).
 *
 * @note The result is a fresh copy made by mutt_str_dup()
 */
char *mutt_ch_get_langinfo_charset(void)
{
  /* Zero-filled, so the buffer is still empty if nothing gets written to it */
  char buf[1024] = { 0 };

  mutt_ch_canonical_charset(buf, sizeof(buf), nl_langinfo(CODESET));

  if (buf[0] != '\0')
    return mutt_str_dup(buf);

  return mutt_str_dup("iso-8859-1");
}
477 
489 bool mutt_ch_lookup_add(enum LookupType type, const char *pat,
490  const char *replace, struct Buffer *err)
491 {
492  if (!pat || !replace)
493  return false;
494 
495  regex_t *rx = mutt_mem_malloc(sizeof(regex_t));
496  int rc = REG_COMP(rx, pat, REG_ICASE);
497  if (rc != 0)
498  {
499  regerror(rc, rx, err->data, err->dsize);
500  FREE(&rx);
501  return false;
502  }
503 
504  struct Lookup *l = lookup_new();
505  l->type = type;
506  l->replacement = mutt_str_dup(replace);
507  l->regex.pattern = mutt_str_dup(pat);
508  l->regex.regex = rx;
509  l->regex.pat_not = false;
510 
511  TAILQ_INSERT_TAIL(&Lookups, l, entries);
512 
513  return true;
514 }
515 
522 {
523  struct Lookup *l = NULL;
524  struct Lookup *tmp = NULL;
525 
526  TAILQ_FOREACH_SAFE(l, &Lookups, entries, tmp)
527  {
528  TAILQ_REMOVE(&Lookups, l, entries);
529  lookup_free(&l);
530  }
531 }
532 
542 const char *mutt_ch_charset_lookup(const char *chs)
543 {
544  return lookup_charset(MUTT_LOOKUP_CHARSET, chs);
545 }
546 
569 iconv_t mutt_ch_iconv_open(const char *tocode, const char *fromcode, uint8_t flags)
570 {
571  char tocode1[128];
572  char fromcode1[128];
573  const char *tocode2 = NULL, *fromcode2 = NULL;
574  const char *tmp = NULL;
575 
576  iconv_t cd;
577 
578  /* transform to MIME preferred charset names */
579  mutt_ch_canonical_charset(tocode1, sizeof(tocode1), tocode);
580  mutt_ch_canonical_charset(fromcode1, sizeof(fromcode1), fromcode);
581 
582  /* maybe apply charset-hooks and recanonicalise fromcode,
583  * but only when caller asked us to sanitize a potentially wrong
584  * charset name incoming from the wild exterior. */
585  if (flags & MUTT_ICONV_HOOK_FROM)
586  {
587  tmp = mutt_ch_charset_lookup(fromcode1);
588  if (tmp)
589  mutt_ch_canonical_charset(fromcode1, sizeof(fromcode1), tmp);
590  }
591 
592  /* always apply iconv-hooks to suit system's iconv tastes */
593  tocode2 = mutt_ch_iconv_lookup(tocode1);
594  tocode2 = tocode2 ? tocode2 : tocode1;
595  fromcode2 = mutt_ch_iconv_lookup(fromcode1);
596  fromcode2 = fromcode2 ? fromcode2 : fromcode1;
597 
598  /* call system iconv with names it appreciates */
599  cd = iconv_open(tocode2, fromcode2);
600  if (cd != (iconv_t) -1)
601  return cd;
602 
603  return (iconv_t) -1;
604 }
605 
622 size_t mutt_ch_iconv(iconv_t cd, const char **inbuf, size_t *inbytesleft,
623  char **outbuf, size_t *outbytesleft, const char **inrepls,
624  const char *outrepl, int *iconverrno)
625 {
626  size_t rc = 0;
627  const char *ib = *inbuf;
628  size_t ibl = *inbytesleft;
629  char *ob = *outbuf;
630  size_t obl = *outbytesleft;
631 
632  while (true)
633  {
634  errno = 0;
635  const size_t ret1 = iconv(cd, (ICONV_CONST char **) &ib, &ibl, &ob, &obl);
636  if (ret1 != (size_t) -1)
637  rc += ret1;
638  if (iconverrno)
639  *iconverrno = errno;
640 
641  if (ibl && obl && (errno == EILSEQ))
642  {
643  if (inrepls)
644  {
645  /* Try replacing the input */
646  const char **t = NULL;
647  for (t = inrepls; *t; t++)
648  {
649  const char *ib1 = *t;
650  size_t ibl1 = strlen(*t);
651  char *ob1 = ob;
652  size_t obl1 = obl;
653  iconv(cd, (ICONV_CONST char **) &ib1, &ibl1, &ob1, &obl1);
654  if (ibl1 == 0)
655  {
656  ib++;
657  ibl--;
658  ob = ob1;
659  obl = obl1;
660  rc++;
661  break;
662  }
663  }
664  if (*t)
665  continue;
666  }
667  /* Replace the output */
668  if (!outrepl)
669  outrepl = "?";
670  iconv(cd, NULL, NULL, &ob, &obl);
671  if (obl)
672  {
673  int n = strlen(outrepl);
674  if (n > obl)
675  {
676  outrepl = "?";
677  n = 1;
678  }
679  memcpy(ob, outrepl, n);
680  ib++;
681  ibl--;
682  ob += n;
683  obl -= n;
684  rc++;
685  iconv(cd, NULL, NULL, NULL, NULL); /* for good measure */
686  continue;
687  }
688  }
689  *inbuf = ib;
690  *inbytesleft = ibl;
691  *outbuf = ob;
692  *outbytesleft = obl;
693  return rc;
694  }
695 }
696 
706 const char *mutt_ch_iconv_lookup(const char *chs)
707 {
708  return lookup_charset(MUTT_LOOKUP_ICONV, chs);
709 }
710 
721 int mutt_ch_check(const char *s, size_t slen, const char *from, const char *to)
722 {
723  if (!s || !from || !to)
724  return -1;
725 
726  int rc = 0;
727  iconv_t cd = mutt_ch_iconv_open(to, from, MUTT_ICONV_NO_FLAGS);
728  if (cd == (iconv_t) -1)
729  return -1;
730 
731  size_t outlen = MB_LEN_MAX * slen;
732  char *out = mutt_mem_malloc(outlen + 1);
733  char *saved_out = out;
734 
735  const size_t convlen =
736  iconv(cd, (ICONV_CONST char **) &s, &slen, &out, (size_t *) &outlen);
737  if (convlen == -1)
738  rc = errno;
739 
740  FREE(&saved_out);
741  iconv_close(cd);
742  return rc;
743 }
744 
/**
 * mutt_ch_convert_string - Convert a string between encodings
 * @param[in,out] ps    String to convert
 * @param[in]     from  Current character set
 * @param[in]     to    Target character set
 * @param[in]     flags Flags, e.g. #MUTT_ICONV_HOOK_FROM
 * @retval 0      Success (also when the string is NULL or empty)
 * @retval -1     `ps`, `from` or `to` is NULL, or the conversion could not be set up
 * @retval errno  Value reported by mutt_ch_iconv() for the conversion
 *
 * Once the conversion has been set up, `*ps` is always freed and replaced by
 * the newly allocated result.
 *
 * Bytes that can't be converted are replaced: with U+FFFD when the target is
 * UTF-8, with U+FFFD or "?" run through the converter when the source is
 * UTF-8, and with a plain "?" otherwise.
 */
int mutt_ch_convert_string(char **ps, const char *from, const char *to, uint8_t flags)
{
  if (!ps)
    return -1;

  const char *src = *ps;
  if (!src || (src[0] == '\0'))
    return 0;

  if (!from || !to)
    return -1;

  iconv_t cd = mutt_ch_iconv_open(to, from, flags);
  if (cd == (iconv_t) -1)
    return -1;

  const char *repls[] = { "\357\277\275", "?", 0 };
  const char **inrepls = NULL;
  const char *outrepl = NULL;

  if (mutt_ch_is_utf8(to))
    outrepl = "\357\277\275";
  else if (mutt_ch_is_utf8(from))
    inrepls = repls;
  else
    outrepl = "?";

  /* The terminating NUL is converted along with the text */
  size_t ibl = strlen(src) + 1;
  size_t obl = MB_LEN_MAX * ibl;
  char *result = mutt_mem_malloc(obl + 1);
  char *ob = result;
  int rc = 0;

  mutt_ch_iconv(cd, &src, &ibl, &ob, &obl, inrepls, outrepl, &rc);
  iconv_close(cd);

  *ob = '\0';

  FREE(ps);
  *ps = result;

  mutt_str_adjust(ps);
  return rc;
}
810 
822 bool mutt_ch_check_charset(const char *cs, bool strict)
823 {
824  if (!cs)
825  return false;
826 
827  if (mutt_ch_is_utf8(cs))
828  return true;
829 
830  if (!strict)
831  {
832  for (int i = 0; PreferredMimeNames[i].key; i++)
833  {
834  if (mutt_istr_equal(PreferredMimeNames[i].key, cs) ||
835  mutt_istr_equal(PreferredMimeNames[i].pref, cs))
836  {
837  return true;
838  }
839  }
840  }
841 
842  iconv_t cd = mutt_ch_iconv_open(cs, cs, MUTT_ICONV_NO_FLAGS);
843  if (cd != (iconv_t) (-1))
844  {
845  iconv_close(cd);
846  return true;
847  }
848 
849  return false;
850 }
851 
862 struct FgetConv *mutt_ch_fgetconv_open(FILE *fp, const char *from, const char *to, uint8_t flags)
863 {
864  struct FgetConv *fc = NULL;
865  iconv_t cd = (iconv_t) -1;
866 
867  if (from && to)
868  cd = mutt_ch_iconv_open(to, from, flags);
869 
870  if (cd != (iconv_t) -1)
871  {
872  static const char *repls[] = { "\357\277\275", "?", 0 };
873 
874  fc = mutt_mem_malloc(sizeof(struct FgetConv));
875  fc->p = fc->bufo;
876  fc->ob = fc->bufo;
877  fc->ib = fc->bufi;
878  fc->ibl = 0;
879  fc->inrepls = mutt_ch_is_utf8(to) ? repls : repls + 1;
880  }
881  else
882  fc = mutt_mem_malloc(sizeof(struct FgetConvNot));
883  fc->fp = fp;
884  fc->cd = cd;
885  return fc;
886 }
887 
893 {
894  if (!fc || !*fc)
895  return;
896 
897  if ((*fc)->cd != (iconv_t) -1)
898  iconv_close((*fc)->cd);
899  FREE(fc);
900 }
901 
912 int mutt_ch_fgetconv(struct FgetConv *fc)
913 {
914  if (!fc)
915  return EOF;
916  if (fc->cd == (iconv_t) -1)
917  return fgetc(fc->fp);
918  if (!fc->p)
919  return EOF;
920  if (fc->p < fc->ob)
921  return (unsigned char) *(fc->p)++;
922 
923  /* Try to convert some more */
924  fc->p = fc->bufo;
925  fc->ob = fc->bufo;
926  if (fc->ibl)
927  {
928  size_t obl = sizeof(fc->bufo);
929  iconv(fc->cd, (ICONV_CONST char **) &fc->ib, &fc->ibl, &fc->ob, &obl);
930  if (fc->p < fc->ob)
931  return (unsigned char) *(fc->p)++;
932  }
933 
934  /* If we trusted iconv a bit more, we would at this point
935  * ask why it had stopped converting ... */
936 
937  /* Try to read some more */
938  if ((fc->ibl == sizeof(fc->bufi)) ||
939  (fc->ibl && (fc->ib + fc->ibl < fc->bufi + sizeof(fc->bufi))))
940  {
941  fc->p = 0;
942  return EOF;
943  }
944  if (fc->ibl)
945  memcpy(fc->bufi, fc->ib, fc->ibl);
946  fc->ib = fc->bufi;
947  fc->ibl += fread(fc->ib + fc->ibl, 1, sizeof(fc->bufi) - fc->ibl, fc->fp);
948 
949  /* Try harder this time to convert some */
950  if (fc->ibl)
951  {
952  size_t obl = sizeof(fc->bufo);
953  mutt_ch_iconv(fc->cd, (const char **) &fc->ib, &fc->ibl, &fc->ob, &obl,
954  fc->inrepls, 0, NULL);
955  if (fc->p < fc->ob)
956  return (unsigned char) *(fc->p)++;
957  }
958 
959  /* Either the file has finished or one of the buffers is too small */
960  fc->p = 0;
961  return EOF;
962 }
963 
/**
 * mutt_ch_fgetconvs - Convert a file's charset into a string buffer
 * @param buf    Buffer for result
 * @param buflen Length of buffer
 * @param fc     FgetConv handle
 * @retval ptr  `buf`, holding at least one character
 * @retval NULL Nothing was read, `buf` is NULL or `buflen` is 0
 *
 * Characters are read with mutt_ch_fgetconv() until a newline has been stored,
 * EOF is hit or the buffer is full.  A newline is kept in the result, which is
 * always NUL-terminated.
 */
char *mutt_ch_fgetconvs(char *buf, size_t buflen, struct FgetConv *fc)
{
  /* A zero-length buffer has no room even for the terminating NUL */
  if (!buf || (buflen == 0))
    return NULL;

  size_t r;
  for (r = 0; (r + 1) < buflen;)
  {
    const int c = mutt_ch_fgetconv(fc);
    if (c == EOF)
      break;
    buf[r++] = (char) c;
    if (c == '\n')
      break;
  }
  buf[r] = '\0';

  if (r > 0)
    return buf;

  return NULL;
}
996 
1007 void mutt_ch_set_charset(const char *charset)
1008 {
1009  char buf[256];
1010 
1011  mutt_ch_canonical_charset(buf, sizeof(buf), charset);
1012 
1013  if (mutt_ch_is_utf8(buf))
1014  {
1015  CharsetIsUtf8 = true;
1016  ReplacementChar = 0xfffd; /* replacement character */
1017  }
1018  else
1019  {
1020  CharsetIsUtf8 = false;
1021  ReplacementChar = '?';
1022  }
1023 
1024 #if defined(HAVE_BIND_TEXTDOMAIN_CODESET) && defined(ENABLE_NLS)
1025  bind_textdomain_codeset(PACKAGE, buf);
1026 #endif
1027 }
1028 
1040 char *mutt_ch_choose(const char *fromcode, const char *charsets, const char *u,
1041  size_t ulen, char **d, size_t *dlen)
1042 {
1043  if (!fromcode)
1044  return NULL;
1045 
1046  char *e = NULL, *tocode = NULL;
1047  size_t elen = 0, bestn = 0;
1048  const char *q = NULL;
1049 
1050  for (const char *p = charsets; p; p = q ? q + 1 : 0)
1051  {
1052  q = strchr(p, ':');
1053 
1054  size_t n = q ? q - p : strlen(p);
1055  if (n == 0)
1056  continue;
1057 
1058  char *t = mutt_mem_malloc(n + 1);
1059  memcpy(t, p, n);
1060  t[n] = '\0';
1061 
1062  char *s = mutt_strn_dup(u, ulen);
1063  const int rc = d ? mutt_ch_convert_string(&s, fromcode, t, MUTT_ICONV_NO_FLAGS) :
1064  mutt_ch_check(s, ulen, fromcode, t);
1065  if (rc)
1066  {
1067  FREE(&t);
1068  FREE(&s);
1069  continue;
1070  }
1071  size_t slen = mutt_str_len(s);
1072 
1073  if (!tocode || (n < bestn))
1074  {
1075  bestn = n;
1076  FREE(&tocode);
1077  tocode = t;
1078  if (d)
1079  {
1080  FREE(&e);
1081  e = s;
1082  }
1083  else
1084  FREE(&s);
1085  elen = slen;
1086  }
1087  else
1088  {
1089  FREE(&t);
1090  FREE(&s);
1091  }
1092  }
1093  if (tocode)
1094  {
1095  if (d)
1096  *d = e;
1097  if (dlen)
1098  *dlen = elen;
1099 
1100  char canonical_buf[1024];
1101  mutt_ch_canonical_charset(canonical_buf, sizeof(canonical_buf), tocode);
1102  mutt_str_replace(&tocode, canonical_buf);
1103  }
1104  return tocode;
1105 }
void * mutt_mem_calloc(size_t nmemb, size_t size)
Allocate zeroed memory on the heap.
Definition: memory.c:50
void mutt_ch_set_charset(const char *charset)
Update the records for a new character set.
Definition: charset.c:1007
#define MIN(a, b)
Definition: memory.h:31
Memory management wrappers.
#define TAILQ_FOREACH(var, head, field)
Definition: queue.h:718
const char * key
Definition: charset.c:85
char bufi[512]
Definition: charset.h:44
char * mutt_ch_choose(const char *fromcode, const char *charsets, const char *u, size_t ulen, char **d, size_t *dlen)
Figure the best charset to encode a string.
Definition: charset.c:1040
static size_t plen
Length of cached packet.
Definition: pgppacket.c:39
regex_t * regex
compiled expression
Definition: regex3.h:92
bool mutt_istrn_equal(const char *a, const char *b, size_t num)
Check for equality of two strings ignoring case (to a maximum), safely.
Definition: string.c:621
String manipulation buffer.
Definition: buffer.h:33
bool pat_not
do not match
Definition: regex3.h:93
#define TAILQ_FOREACH_SAFE(var, head, field, tvar)
Definition: queue.h:728
char * mutt_str_dup(const char *str)
Copy a string, safely.
Definition: string.c:370
char * replacement
Alternative charset to use.
Definition: charset.c:73
#define REG_COMP(preg, regex, cflags)
Compile a regular expression.
Definition: regex3.h:54
int mutt_ch_convert_string(char **ps, const char *from, const char *to, uint8_t flags)
Convert a string between encodings.
Definition: charset.c:758
struct Regex regex
Regular expression.
Definition: charset.c:72
#define MUTT_ICONV_NO_FLAGS
No flags are set.
Definition: charset.h:71
Container for Accounts, Notifications.
Definition: neomutt.h:36
void mutt_ch_lookup_remove(void)
Remove all the character set lookups.
Definition: charset.c:521
iconv_t mutt_ch_iconv_open(const char *tocode, const char *fromcode, uint8_t flags)
Set up iconv for conversions.
Definition: charset.c:569
Convenience wrapper for the config headers.
String manipulation functions.
Character set conversion.
Definition: charset.h:68
TAILQ_HEAD(LookupList, Lookup)
char * mutt_ch_fgetconvs(char *buf, size_t buflen, struct FgetConv *fc)
Convert a file's charset into a string buffer.
Definition: charset.c:974
Regex to String lookup table.
Definition: charset.c:69
bool mutt_ch_chscmp(const char *cs1, const char *cs2)
Are the names of two character sets equivalent?
Definition: charset.c:420
size_t dsize
Length of data.
Definition: buffer.h:37
bool mutt_ch_check_charset(const char *cs, bool strict)
Does iconv understand a character set?
Definition: charset.c:822
static struct Lookup * lookup_new(void)
Create a new Lookup.
Definition: charset.c:248
char * mutt_strn_dup(const char *begin, size_t len)
Duplicate a sub-string.
Definition: string.c:548
size_t ibl
Definition: charset.h:49
Convenience wrapper for the core headers.
#define EILSEQ
Definition: charset.c:51
static char * chs
Definition: gnupgparse.c:73
char * mutt_ch_get_default_charset(void)
Get the default character set.
Definition: charset.c:442
bool mutt_istr_equal(const char *a, const char *b)
Compare two strings, ignoring case.
Definition: string.c:883
Alias for another character set.
Definition: charset.h:67
#define TAILQ_REMOVE(head, elm, field)
Definition: queue.h:834
const char * mutt_ch_iconv_lookup(const char *chs)
Look for a replacement character set.
Definition: charset.c:706
A dummy converter.
Definition: charset.h:56
FILE * fp
Definition: charset.h:42
void mutt_str_adjust(char **ptr)
Shrink-to-fit a string.
Definition: string.c:490
iconv_t cd
Definition: charset.h:43
#define MUTT_ICONV_HOOK_FROM
apply charset-hooks to fromcode
Definition: charset.h:72
int mutt_ch_check(const char *s, size_t slen, const char *from, const char *to)
Check whether a string can be converted between encodings.
Definition: charset.c:721
void * mutt_mem_malloc(size_t size)
Allocate memory on the heap.
Definition: memory.c:90
const char * mutt_ch_charset_lookup(const char *chs)
Look for a replacement character set.
Definition: charset.c:542
char * data
Pointer to data.
Definition: buffer.h:35
wchar_t ReplacementChar
When a Unicode character can't be displayed, use this instead.
Definition: charset.c:57
int mutt_ch_convert_nonmime_string(char **ps)
Try to convert a string using a list of character sets.
Definition: charset.c:308
static const char * lookup_charset(enum LookupType type, const char *cs)
Look for a preferred character set name.
Definition: charset.c:282
const char * cs_subset_string(const struct ConfigSubset *sub, const char *name)
Get a string config item by name.
Definition: helpers.c:295
#define TAILQ_INSERT_TAIL(head, elm, field)
Definition: queue.h:802
char * ib
Definition: charset.h:48
char * p
Definition: charset.h:46
size_t mutt_istr_startswith(const char *str, const char *prefix)
Check whether a string starts with a prefix, ignoring case.
Definition: string.c:172
MIME name lookup entry.
Definition: charset.c:83
char bufo[512]
Definition: charset.h:45
bool CharsetIsUtf8
Is the user's current character set utf-8?
Definition: charset.c:62
enum LookupType type
Lookup type.
Definition: charset.c:71
char * mutt_str_cat(char *buf, size_t buflen, const char *s)
Concatenate two strings.
Definition: string.c:385
#define TAILQ_ENTRY(type)
Definition: queue.h:633
size_t mutt_str_len(const char *a)
Calculate the length of a string, safely.
Definition: string.c:631
General purpose object for storing and parsing strings.
const char * pref
Definition: charset.c:86
size_t mutt_ch_iconv(iconv_t cd, const char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft, const char **inrepls, const char *outrepl, int *iconverrno)
Change the encoding of a string.
Definition: charset.c:622
size_t mutt_str_copy(char *dest, const char *src, size_t dsize)
Copy a string into a buffer (guaranteeing NUL-termination)
Definition: string.c:716
void mutt_ch_fgetconv_close(struct FgetConv **fc)
Close an fgetconv handle.
Definition: charset.c:892
char * mutt_str_replace(char **p, const char *s)
Replace one string with another.
Definition: string.c:446
Cached regular expression.
Definition: regex3.h:89
#define mutt_ch_is_utf8(str)
Definition: charset.h:95
bool mutt_regex_match(const struct Regex *regex, const char *str)
Shorthand to mutt_regex_capture()
Definition: regex.c:613
#define FREE(x)
Definition: memory.h:40
struct ConfigSubset * sub
Inherited config items.
Definition: neomutt.h:39
static struct LookupList Lookups
Definition: charset.c:78
char * pattern
printable version
Definition: regex3.h:91
#define TAILQ_HEAD_INITIALIZER(head)
Definition: queue.h:630
Conversion between different character encodings.
char * mutt_ch_get_langinfo_charset(void)
Get the user's choice of character set.
Definition: charset.c:466
void mutt_ch_canonical_charset(char *buf, size_t buflen, const char *name)
Canonicalise the charset of a string.
Definition: charset.c:354
int mutt_ch_fgetconv(struct FgetConv *fc)
Convert a file's character set.
Definition: charset.c:912
Cursor for converting a file's encoding.
Definition: charset.h:40
struct FgetConv * mutt_ch_fgetconv_open(FILE *fp, const char *from, const char *to, uint8_t flags)
Prepare a file for charset conversion.
Definition: charset.c:862
static void lookup_free(struct Lookup **ptr)
Free a Lookup.
Definition: charset.c:257
LookupType
Types of character set lookups.
Definition: charset.h:65
char * ob
Definition: charset.h:47
bool mutt_ch_lookup_add(enum LookupType type, const char *pat, const char *replace, struct Buffer *err)
Add a new character set lookup.
Definition: charset.c:489
const char ** inrepls
Definition: charset.h:50
Manage regular expressions.