NeoMutt  2021-10-29-225-gb9986f
Teaching an old dog new tricks
DOXYGEN
charset.c
Go to the documentation of this file.
1 
29 #include "config.h"
30 #include <ctype.h>
31 #include <errno.h>
32 #include <iconv.h>
33 #include <langinfo.h>
34 #include <limits.h>
35 #include <stdbool.h>
36 #include <stdio.h>
37 #include <string.h>
38 #include "config/lib.h"
39 #include "core/lib.h"
40 #include "charset.h"
41 #include "buffer.h"
42 #include "memory.h"
43 #include "queue.h"
44 #include "regex3.h"
45 #include "string2.h"
46 #ifdef ENABLE_NLS
47 #include <libintl.h>
48 #endif
49 
50 #ifndef EILSEQ
51 #define EILSEQ EINVAL
52 #endif
53 
/**
 * ReplacementChar - When a Unicode character can't be displayed, use this instead
 *
 * Starts out as '?'; mutt_ch_set_charset() switches it to 0xfffd when the
 * character set is utf-8.
 */
57 wchar_t ReplacementChar = '?';
58 
/**
 * CharsetIsUtf8 - Is the user's current character set utf-8?
 *
 * Kept up to date by mutt_ch_set_charset().
 */
62 bool CharsetIsUtf8 = false;
63 
/**
 * struct Lookup - Regex to String lookup table
 *
 * Entries are created by mutt_ch_lookup_add() and searched by
 * lookup_charset(): the first entry of the requested type whose regex
 * matches supplies the replacement name.
 */
69 struct Lookup
70 {
71  enum LookupType type; ///< Lookup type
72  struct Regex regex; ///< Regular expression
73  char *replacement; ///< Alternative charset to use
74  TAILQ_ENTRY(Lookup) entries; ///< Linked list
75 };
76 TAILQ_HEAD(LookupList, Lookup);
77 
/// List of all registered lookups; emptied by mutt_ch_lookup_remove()
78 static struct LookupList Lookups = TAILQ_HEAD_INITIALIZER(Lookups);
79 
/**
 * struct MimeNames - MIME name lookup entry
 */
83 struct MimeNames
84 {
85  const char *key; ///< Charset alias to look for (compared ignoring case)
86  const char *pref; ///< Preferred name to use instead of the alias
87 };
88 
/**
 * PreferredMimeNames - Lookup table of preferred charsets
 *
 * Each entry maps a charset alias (key) to the name that should be used in
 * its place (pref).  The table ends with a { NULL, NULL } sentinel, which is
 * what the loops in mutt_ch_canonical_charset() and mutt_ch_check_charset()
 * rely on to stop.  Keys are compared ignoring case.
 */
99 const struct MimeNames PreferredMimeNames[] = {
100  // clang-format off
101  { "ansi_x3.4-1968", "us-ascii" },
102  { "iso-ir-6", "us-ascii" },
103  { "iso_646.irv:1991", "us-ascii" },
104  { "ascii", "us-ascii" },
105  { "iso646-us", "us-ascii" },
106  { "us", "us-ascii" },
107  { "ibm367", "us-ascii" },
108  { "cp367", "us-ascii" },
109  { "csASCII", "us-ascii" },
110 
111  { "csISO2022KR", "iso-2022-kr" },
112  { "csEUCKR", "euc-kr" },
113  { "csISO2022JP", "iso-2022-jp" },
114  { "csISO2022JP2", "iso-2022-jp-2" },
115 
116  { "ISO_8859-1:1987", "iso-8859-1" },
117  { "iso-ir-100", "iso-8859-1" },
118  { "iso_8859-1", "iso-8859-1" },
119  { "latin1", "iso-8859-1" },
120  { "l1", "iso-8859-1" },
121  { "IBM819", "iso-8859-1" },
122  { "CP819", "iso-8859-1" },
123  { "csISOLatin1", "iso-8859-1" },
124 
125  { "ISO_8859-2:1987", "iso-8859-2" },
126  { "iso-ir-101", "iso-8859-2" },
127  { "iso_8859-2", "iso-8859-2" },
128  { "latin2", "iso-8859-2" },
129  { "l2", "iso-8859-2" },
130  { "csISOLatin2", "iso-8859-2" },
131 
132  { "ISO_8859-3:1988", "iso-8859-3" },
133  { "iso-ir-109", "iso-8859-3" },
134  { "ISO_8859-3", "iso-8859-3" },
135  { "latin3", "iso-8859-3" },
136  { "l3", "iso-8859-3" },
137  { "csISOLatin3", "iso-8859-3" },
138 
139  { "ISO_8859-4:1988", "iso-8859-4" },
140  { "iso-ir-110", "iso-8859-4" },
141  { "ISO_8859-4", "iso-8859-4" },
142  { "latin4", "iso-8859-4" },
143  { "l4", "iso-8859-4" },
144  { "csISOLatin4", "iso-8859-4" },
145 
146  { "ISO_8859-6:1987", "iso-8859-6" },
147  { "iso-ir-127", "iso-8859-6" },
148  { "iso_8859-6", "iso-8859-6" },
149  { "ECMA-114", "iso-8859-6" },
150  { "ASMO-708", "iso-8859-6" },
151  { "arabic", "iso-8859-6" },
152  { "csISOLatinArabic", "iso-8859-6" },
153 
154  { "ISO_8859-7:1987", "iso-8859-7" },
155  { "iso-ir-126", "iso-8859-7" },
156  { "ISO_8859-7", "iso-8859-7" },
157  { "ELOT_928", "iso-8859-7" },
158  { "ECMA-118", "iso-8859-7" },
159  { "greek", "iso-8859-7" },
160  { "greek8", "iso-8859-7" },
161  { "csISOLatinGreek", "iso-8859-7" },
162 
163  { "ISO_8859-8:1988", "iso-8859-8" },
164  { "iso-ir-138", "iso-8859-8" },
165  { "ISO_8859-8", "iso-8859-8" },
166  { "hebrew", "iso-8859-8" },
167  { "csISOLatinHebrew", "iso-8859-8" },
168 
169  { "ISO_8859-5:1988", "iso-8859-5" },
170  { "iso-ir-144", "iso-8859-5" },
171  { "ISO_8859-5", "iso-8859-5" },
172  { "cyrillic", "iso-8859-5" },
173  { "csISOLatinCyrillic", "iso-8859-5" },
174 
175  { "ISO_8859-9:1989", "iso-8859-9" },
176  { "iso-ir-148", "iso-8859-9" },
177  { "ISO_8859-9", "iso-8859-9" },
178  { "latin5", "iso-8859-9" }, /* this is not a bug */
179  { "l5", "iso-8859-9" },
180  { "csISOLatin5", "iso-8859-9" },
181 
182  { "ISO_8859-10:1992", "iso-8859-10" },
183  { "iso-ir-157", "iso-8859-10" },
184  { "latin6", "iso-8859-10" }, /* this is not a bug */
185  { "l6", "iso-8859-10" },
186  { "csISOLatin6", "iso-8859-10" },
187 
188  { "csKOI8r", "koi8-r" },
189 
190  { "MS_Kanji", "Shift_JIS" }, /* Note the underscore! */
191  { "csShiftJis", "Shift_JIS" },
192 
193  { "Extended_UNIX_Code_Packed_Format_for_Japanese",
194  "euc-jp" },
195  { "csEUCPkdFmtJapanese", "euc-jp" },
196 
197  { "csGB2312", "gb2312" },
198  { "csbig5", "big5" },
199 
200  /* End of official brain damage.
201  * What follows has been taken from glibc's localedata files. */
202 
203  { "iso_8859-13", "iso-8859-13" },
204  { "iso-ir-179", "iso-8859-13" },
205  { "latin7", "iso-8859-13" }, /* this is not a bug */
206  { "l7", "iso-8859-13" },
207 
208  { "iso_8859-14", "iso-8859-14" },
209  { "latin8", "iso-8859-14" }, /* this is not a bug */
210  { "l8", "iso-8859-14" },
211 
212  { "iso_8859-15", "iso-8859-15" },
213  { "latin9", "iso-8859-15" }, /* this is not a bug */
214 
215  /* Suggested by Ionel Mugurel Ciobica <tgakic@sg10.chem.tue.nl> */
216  { "latin0", "iso-8859-15" }, /* this is not a bug */
217 
218  { "iso_8859-16", "iso-8859-16" },
219  { "latin10", "iso-8859-16" }, /* this is not a bug */
220 
221  { "646", "us-ascii" },
222 
223  /* http://www.sun.com/software/white-papers/wp-unicode/ */
224 
225  { "eucJP", "euc-jp" },
226  { "PCK", "Shift_JIS" },
227  { "ko_KR-euc", "euc-kr" },
228  { "zh_TW-big5", "big5" },
229 
230  /* seems to be common on some systems */
231 
232  { "sjis", "Shift_JIS" },
233  { "euc-jp-ms", "eucJP-ms" },
234 
235  /* If you happen to encounter system-specific brain-damage with respect to
236  * character set naming, please add it above this comment, and submit a patch
237  * to <neomutt-devel@neomutt.org> */
238 
239  { NULL, NULL },
240  // clang-format on
241 };
242 
247 static struct Lookup *lookup_new(void)
248 {
249  return mutt_mem_calloc(1, sizeof(struct Lookup));
250 }
251 
256 static void lookup_free(struct Lookup **ptr)
257 {
258  if (!ptr || !*ptr)
259  return;
260 
261  struct Lookup *l = *ptr;
262  FREE(&l->replacement);
263  FREE(&l->regex.pattern);
264  if (l->regex.regex)
265  regfree(l->regex.regex);
266  FREE(&l->regex.regex);
267  FREE(&l->regex);
268 
269  FREE(ptr);
270 }
271 
281 static const char *lookup_charset(enum LookupType type, const char *cs)
282 {
283  if (!cs)
284  return NULL;
285 
286  struct Lookup *l = NULL;
287 
288  TAILQ_FOREACH(l, &Lookups, entries)
289  {
290  if (l->type != type)
291  continue;
292  if (mutt_regex_match(&l->regex, cs))
293  return l->replacement;
294  }
295  return NULL;
296 }
297 
308 {
309  if (!ps)
310  return -1;
311 
312  char *u = *ps;
313  const size_t ulen = mutt_str_len(u);
314  if (ulen == 0)
315  return 0;
316 
317  const char *c1 = NULL;
318 
319  const char *const c_assumed_charset =
320  cs_subset_string(NeoMutt->sub, "assumed_charset");
321  const char *const c_charset = cs_subset_string(NeoMutt->sub, "charset");
322  for (const char *c = c_assumed_charset; c; c = c1 ? c1 + 1 : 0)
323  {
324  c1 = strchr(c, ':');
325  size_t n = c1 ? c1 - c : mutt_str_len(c);
326  if (n == 0)
327  return 0;
328  char *fromcode = mutt_mem_malloc(n + 1);
329  mutt_str_copy(fromcode, c, n + 1);
330  char *s = mutt_strn_dup(u, ulen);
331  int m = mutt_ch_convert_string(&s, fromcode, c_charset, MUTT_ICONV_NO_FLAGS);
332  FREE(&fromcode);
333  if (m == 0)
334  {
335  FREE(ps);
336  *ps = s;
337  return 0;
338  }
339  FREE(&s);
340  }
342  c_charset, MUTT_ICONV_HOOK_FROM);
343  return -1;
344 }
345 
355 void mutt_ch_canonical_charset(char *buf, size_t buflen, const char *name)
356 {
357  if (!buf || !name)
358  return;
359 
360  char in[1024], scratch[1024 + 10];
361 
362  mutt_str_copy(in, name, sizeof(in));
363  char *ext = strchr(in, '/');
364  if (ext)
365  *ext++ = '\0';
366 
367  if (mutt_istr_equal(in, "utf-8") || mutt_istr_equal(in, "utf8"))
368  {
369  mutt_str_copy(buf, "utf-8", buflen);
370  goto out;
371  }
372 
373  /* catch some common iso-8859-something misspellings */
374  size_t plen;
375  if ((plen = mutt_istr_startswith(in, "8859")) && (in[plen] != '-'))
376  snprintf(scratch, sizeof(scratch), "iso-8859-%s", in + plen);
377  else if ((plen = mutt_istr_startswith(in, "8859-")))
378  snprintf(scratch, sizeof(scratch), "iso-8859-%s", in + plen);
379  else if ((plen = mutt_istr_startswith(in, "iso8859")) && (in[plen] != '-'))
380  snprintf(scratch, sizeof(scratch), "iso_8859-%s", in + plen);
381  else if ((plen = mutt_istr_startswith(in, "iso8859-")))
382  snprintf(scratch, sizeof(scratch), "iso_8859-%s", in + plen);
383  else
384  mutt_str_copy(scratch, in, sizeof(scratch));
385 
386  for (size_t i = 0; PreferredMimeNames[i].key; i++)
387  {
388  if (mutt_istr_equal(scratch, PreferredMimeNames[i].key))
389  {
390  mutt_str_copy(buf, PreferredMimeNames[i].pref, buflen);
391  goto out;
392  }
393  }
394 
395  mutt_str_copy(buf, scratch, buflen);
396 
397  /* for cosmetics' sake, transform to lowercase. */
398  for (char *p = buf; *p; p++)
399  *p = tolower(*p);
400 
401 out:
402  if (ext && *ext)
403  {
404  mutt_str_cat(buf, buflen, "/");
405  mutt_str_cat(buf, buflen, ext);
406  }
407 }
408 
/**
 * mutt_ch_chscmp - Are the names of two character sets equivalent?
 * @param cs1 First character set; canonicalised before the comparison
 * @param cs2 Second character set; used exactly as given
 * @retval true  The shorter of the two names is a prefix of the longer one,
 *               ignoring case
 * @retval false Otherwise, or either argument is NULL
 */
bool mutt_ch_chscmp(const char *cs1, const char *cs2)
{
  if (!cs1 || !cs2)
    return false;

  char canon[256];
  mutt_ch_canonical_charset(canon, sizeof(canon), cs1);

  const int len1 = mutt_str_len(canon);
  const int len2 = mutt_str_len(cs2);

  /* Longer name first, compared over the length of the shorter one */
  if (len1 > len2)
    return mutt_istrn_equal(canon, cs2, len2);

  return mutt_istrn_equal(cs2, canon, len1);
}
436 
444 {
445  static char fcharset[128];
446  const char *const c_assumed_charset =
447  cs_subset_string(NeoMutt->sub, "assumed_charset");
448  const char *c = c_assumed_charset;
449  const char *c1 = NULL;
450 
451  if (c)
452  {
453  c1 = strchr(c, ':');
454 
455  size_t copysize;
456  if (c1)
457  copysize = MIN((c1 - c + 1), sizeof(fcharset));
458  else
459  copysize = sizeof(fcharset);
460  mutt_str_copy(fcharset, c, copysize);
461  return fcharset;
462  }
463  return strcpy(fcharset, "us-ascii");
464 }
465 
/**
 * mutt_ch_get_langinfo_charset - Get the user's choice of character set
 * @retval ptr Duplicated charset string
 *
 * The name comes from nl_langinfo(CODESET), canonicalised.  If that yields
 * nothing, "iso-8859-1" is used.
 *
 * NOTE(review): the result comes from mutt_str_dup(), so the caller presumably
 * has to free it - confirm.
 */
char *mutt_ch_get_langinfo_charset(void)
{
  char buf[1024] = { 0 };

  mutt_ch_canonical_charset(buf, sizeof(buf), nl_langinfo(CODESET));

  if (buf[0] != '\0')
    return mutt_str_dup(buf);

  return mutt_str_dup("iso-8859-1");
}
484 
496 bool mutt_ch_lookup_add(enum LookupType type, const char *pat,
497  const char *replace, struct Buffer *err)
498 {
499  if (!pat || !replace)
500  return false;
501 
502  regex_t *rx = mutt_mem_calloc(1, sizeof(regex_t));
503  int rc = REG_COMP(rx, pat, REG_ICASE);
504  if (rc != 0)
505  {
506  regerror(rc, rx, err->data, err->dsize);
507  FREE(&rx);
508  return false;
509  }
510 
511  struct Lookup *l = lookup_new();
512  l->type = type;
513  l->replacement = mutt_str_dup(replace);
514  l->regex.pattern = mutt_str_dup(pat);
515  l->regex.regex = rx;
516  l->regex.pat_not = false;
517 
518  TAILQ_INSERT_TAIL(&Lookups, l, entries);
519 
520  return true;
521 }
522 
529 {
530  struct Lookup *l = NULL;
531  struct Lookup *tmp = NULL;
532 
533  TAILQ_FOREACH_SAFE(l, &Lookups, entries, tmp)
534  {
535  TAILQ_REMOVE(&Lookups, l, entries);
536  lookup_free(&l);
537  }
538 }
539 
549 const char *mutt_ch_charset_lookup(const char *chs)
550 {
552 }
553 
576 iconv_t mutt_ch_iconv_open(const char *tocode, const char *fromcode, uint8_t flags)
577 {
578  char tocode1[128];
579  char fromcode1[128];
580  const char *tocode2 = NULL, *fromcode2 = NULL;
581  const char *tmp = NULL;
582 
583  iconv_t cd;
584 
585  /* transform to MIME preferred charset names */
586  mutt_ch_canonical_charset(tocode1, sizeof(tocode1), tocode);
587  mutt_ch_canonical_charset(fromcode1, sizeof(fromcode1), fromcode);
588 
589  /* maybe apply charset-hooks and recanonicalise fromcode,
590  * but only when caller asked us to sanitize a potentially wrong
591  * charset name incoming from the wild exterior. */
592  if (flags & MUTT_ICONV_HOOK_FROM)
593  {
594  tmp = mutt_ch_charset_lookup(fromcode1);
595  if (tmp)
596  mutt_ch_canonical_charset(fromcode1, sizeof(fromcode1), tmp);
597  }
598 
599  /* always apply iconv-hooks to suit system's iconv tastes */
600  tocode2 = mutt_ch_iconv_lookup(tocode1);
601  tocode2 = tocode2 ? tocode2 : tocode1;
602  fromcode2 = mutt_ch_iconv_lookup(fromcode1);
603  fromcode2 = fromcode2 ? fromcode2 : fromcode1;
604 
605  /* call system iconv with names it appreciates */
606  cd = iconv_open(tocode2, fromcode2);
607  if (cd != (iconv_t) -1)
608  return cd;
609 
610  return (iconv_t) -1;
611 }
612 
629 size_t mutt_ch_iconv(iconv_t cd, const char **inbuf, size_t *inbytesleft,
630  char **outbuf, size_t *outbytesleft, const char **inrepls,
631  const char *outrepl, int *iconverrno)
632 {
633  size_t rc = 0;
634  const char *ib = *inbuf;
635  size_t ibl = *inbytesleft;
636  char *ob = *outbuf;
637  size_t obl = *outbytesleft;
638 
639  while (true)
640  {
641  errno = 0;
642  const size_t ret1 = iconv(cd, (ICONV_CONST char **) &ib, &ibl, &ob, &obl);
643  if (ret1 != (size_t) -1)
644  rc += ret1;
645  if (iconverrno)
646  *iconverrno = errno;
647 
648  if (ibl && obl && (errno == EILSEQ))
649  {
650  if (inrepls)
651  {
652  /* Try replacing the input */
653  const char **t = NULL;
654  for (t = inrepls; *t; t++)
655  {
656  const char *ib1 = *t;
657  size_t ibl1 = strlen(*t);
658  char *ob1 = ob;
659  size_t obl1 = obl;
660  iconv(cd, (ICONV_CONST char **) &ib1, &ibl1, &ob1, &obl1);
661  if (ibl1 == 0)
662  {
663  ib++;
664  ibl--;
665  ob = ob1;
666  obl = obl1;
667  rc++;
668  break;
669  }
670  }
671  if (*t)
672  continue;
673  }
674  /* Replace the output */
675  if (!outrepl)
676  outrepl = "?";
677  iconv(cd, NULL, NULL, &ob, &obl);
678  if (obl)
679  {
680  int n = strlen(outrepl);
681  if (n > obl)
682  {
683  outrepl = "?";
684  n = 1;
685  }
686  memcpy(ob, outrepl, n);
687  ib++;
688  ibl--;
689  ob += n;
690  obl -= n;
691  rc++;
692  iconv(cd, NULL, NULL, NULL, NULL); /* for good measure */
693  continue;
694  }
695  }
696  *inbuf = ib;
697  *inbytesleft = ibl;
698  *outbuf = ob;
699  *outbytesleft = obl;
700  return rc;
701  }
702 }
703 
713 const char *mutt_ch_iconv_lookup(const char *chs)
714 {
716 }
717 
728 int mutt_ch_check(const char *s, size_t slen, const char *from, const char *to)
729 {
730  if (!s || !from || !to)
731  return -1;
732 
733  int rc = 0;
734  iconv_t cd = mutt_ch_iconv_open(to, from, MUTT_ICONV_NO_FLAGS);
735  if (cd == (iconv_t) -1)
736  return -1;
737 
738  size_t outlen = MB_LEN_MAX * slen;
739  char *out = mutt_mem_malloc(outlen + 1);
740  char *saved_out = out;
741 
742  const size_t convlen = iconv(cd, (ICONV_CONST char **) &s, &slen, &out, &outlen);
743  if (convlen == -1)
744  rc = errno;
745 
746  FREE(&saved_out);
747  iconv_close(cd);
748  return rc;
749 }
750 
/**
 * mutt_ch_convert_string - Convert a string between encodings
 * @param[in,out] ps    String to convert
 * @param[in]     from  Current character set
 * @param[in]     to    Target character set
 * @param[in]     flags Flags, e.g. #MUTT_ICONV_HOOK_FROM
 * @retval 0      Success (also for a NULL or empty string)
 * @retval -1     Error: invalid arguments, or mutt_ch_iconv_open() failed
 * @retval errno  errno reported by mutt_ch_iconv()
 *
 * Once the converter is open, `*ps` is always freed and replaced by the newly
 * allocated result, whatever mutt_ch_iconv() reports.
 *
 * Unconvertible input is replaced: with U+FFFD when converting to utf-8,
 * by trying U+FFFD then "?" as input when converting from utf-8, and with
 * "?" otherwise.
 */
int mutt_ch_convert_string(char **ps, const char *from, const char *to, uint8_t flags)
{
  if (!ps)
    return -1;

  char *src = *ps;
  if (!src || (*src == '\0'))
    return 0;

  if (!to || !from)
    return -1;

  const char *repls[] = { "\357\277\275", "?", 0 };
  int rc = 0;

  iconv_t cd = mutt_ch_iconv_open(to, from, flags);
  if (cd == (iconv_t) -1)
    return -1;

  const char **inrepls = NULL;
  const char *outrepl = NULL;
  if (mutt_ch_is_utf8(to))
    outrepl = "\357\277\275";
  else if (mutt_ch_is_utf8(from))
    inrepls = repls;
  else
    outrepl = "?";

  /* The terminating NUL is converted along with the text */
  const char *ib = src;
  size_t ibl = strlen(src) + 1;
  size_t obl = MB_LEN_MAX * ibl;
  char *result = mutt_mem_malloc(obl + 1);
  char *ob = result;

  mutt_ch_iconv(cd, &ib, &ibl, &ob, &obl, inrepls, outrepl, &rc);
  iconv_close(cd);

  *ob = '\0';

  FREE(ps);
  *ps = result;

  mutt_str_adjust(ps);
  return rc;
}
816 
828 bool mutt_ch_check_charset(const char *cs, bool strict)
829 {
830  if (!cs)
831  return false;
832 
833  if (mutt_ch_is_utf8(cs))
834  return true;
835 
836  if (!strict)
837  {
838  for (int i = 0; PreferredMimeNames[i].key; i++)
839  {
840  if (mutt_istr_equal(PreferredMimeNames[i].key, cs) ||
841  mutt_istr_equal(PreferredMimeNames[i].pref, cs))
842  {
843  return true;
844  }
845  }
846  }
847 
848  iconv_t cd = mutt_ch_iconv_open(cs, cs, MUTT_ICONV_NO_FLAGS);
849  if (cd != (iconv_t) (-1))
850  {
851  iconv_close(cd);
852  return true;
853  }
854 
855  return false;
856 }
857 
868 struct FgetConv *mutt_ch_fgetconv_open(FILE *fp, const char *from, const char *to, uint8_t flags)
869 {
870  struct FgetConv *fc = NULL;
871  iconv_t cd = (iconv_t) -1;
872 
873  if (from && to)
874  cd = mutt_ch_iconv_open(to, from, flags);
875 
876  if (cd != (iconv_t) -1)
877  {
878  static const char *repls[] = { "\357\277\275", "?", 0 };
879 
880  fc = mutt_mem_malloc(sizeof(struct FgetConv));
881  fc->p = fc->bufo;
882  fc->ob = fc->bufo;
883  fc->ib = fc->bufi;
884  fc->ibl = 0;
885  fc->inrepls = mutt_ch_is_utf8(to) ? repls : repls + 1;
886  }
887  else
888  fc = mutt_mem_malloc(sizeof(struct FgetConvNot));
889  fc->fp = fp;
890  fc->cd = cd;
891  return fc;
892 }
893 
899 {
900  if (!fc || !*fc)
901  return;
902 
903  if ((*fc)->cd != (iconv_t) -1)
904  iconv_close((*fc)->cd);
905  FREE(fc);
906 }
907 
918 int mutt_ch_fgetconv(struct FgetConv *fc)
919 {
920  if (!fc)
921  return EOF;
922  if (fc->cd == (iconv_t) -1)
923  return fgetc(fc->fp);
924  if (!fc->p)
925  return EOF;
926  if (fc->p < fc->ob)
927  return (unsigned char) *(fc->p)++;
928 
929  /* Try to convert some more */
930  fc->p = fc->bufo;
931  fc->ob = fc->bufo;
932  if (fc->ibl)
933  {
934  size_t obl = sizeof(fc->bufo);
935  iconv(fc->cd, (ICONV_CONST char **) &fc->ib, &fc->ibl, &fc->ob, &obl);
936  if (fc->p < fc->ob)
937  return (unsigned char) *(fc->p)++;
938  }
939 
940  /* If we trusted iconv a bit more, we would at this point
941  * ask why it had stopped converting ... */
942 
943  /* Try to read some more */
944  if ((fc->ibl == sizeof(fc->bufi)) ||
945  (fc->ibl && (fc->ib + fc->ibl < fc->bufi + sizeof(fc->bufi))))
946  {
947  fc->p = 0;
948  return EOF;
949  }
950  if (fc->ibl)
951  memcpy(fc->bufi, fc->ib, fc->ibl);
952  fc->ib = fc->bufi;
953  fc->ibl += fread(fc->ib + fc->ibl, 1, sizeof(fc->bufi) - fc->ibl, fc->fp);
954 
955  /* Try harder this time to convert some */
956  if (fc->ibl)
957  {
958  size_t obl = sizeof(fc->bufo);
959  mutt_ch_iconv(fc->cd, (const char **) &fc->ib, &fc->ibl, &fc->ob, &obl,
960  fc->inrepls, 0, NULL);
961  if (fc->p < fc->ob)
962  return (unsigned char) *(fc->p)++;
963  }
964 
965  /* Either the file has finished or one of the buffers is too small */
966  fc->p = 0;
967  return EOF;
968 }
969 
/**
 * mutt_ch_fgetconvs - Convert a file's charset into a string buffer
 * @param buf    Buffer for the result
 * @param buflen Length of the buffer
 * @param fc     FgetConv handle
 * @retval ptr  `buf`, holding at least one converted byte
 * @retval NULL Nothing could be read, or `buf` is NULL or `buflen` is 0
 *
 * Reads converted bytes until a newline has been stored, the buffer is full
 * (one byte is kept for the NUL) or mutt_ch_fgetconv() returns EOF.  The
 * newline, if any, is kept in the buffer.
 */
char *mutt_ch_fgetconvs(char *buf, size_t buflen, struct FgetConv *fc)
{
  /* With no room at all, even the terminating NUL would overflow */
  if (!buf || (buflen == 0))
    return NULL;

  size_t r;
  for (r = 0; (r + 1) < buflen;)
  {
    const int c = mutt_ch_fgetconv(fc);
    if (c == EOF)
      break;
    buf[r++] = (char) c;
    if (c == '\n')
      break;
  }
  buf[r] = '\0';

  if (r > 0)
    return buf;

  return NULL;
}
1002 
1013 void mutt_ch_set_charset(const char *charset)
1014 {
1015  char buf[256];
1016 
1017  mutt_ch_canonical_charset(buf, sizeof(buf), charset);
1018 
1019  if (mutt_ch_is_utf8(buf))
1020  {
1021  CharsetIsUtf8 = true;
1022  ReplacementChar = 0xfffd; /* replacement character */
1023  }
1024  else
1025  {
1026  CharsetIsUtf8 = false;
1027  ReplacementChar = '?';
1028  }
1029 
1030 #if defined(HAVE_BIND_TEXTDOMAIN_CODESET) && defined(ENABLE_NLS)
1031  bind_textdomain_codeset(PACKAGE, buf);
1032 #endif
1033 }
1034 
1046 char *mutt_ch_choose(const char *fromcode, const char *charsets, const char *u,
1047  size_t ulen, char **d, size_t *dlen)
1048 {
1049  if (!fromcode)
1050  return NULL;
1051 
1052  char *e = NULL, *tocode = NULL;
1053  size_t elen = 0, bestn = 0;
1054  const char *q = NULL;
1055 
1056  for (const char *p = charsets; p; p = q ? q + 1 : 0)
1057  {
1058  q = strchr(p, ':');
1059 
1060  size_t n = q ? q - p : strlen(p);
1061  if (n == 0)
1062  continue;
1063 
1064  char *t = mutt_mem_malloc(n + 1);
1065  memcpy(t, p, n);
1066  t[n] = '\0';
1067 
1068  char *s = mutt_strn_dup(u, ulen);
1069  const int rc = d ? mutt_ch_convert_string(&s, fromcode, t, MUTT_ICONV_NO_FLAGS) :
1070  mutt_ch_check(s, ulen, fromcode, t);
1071  if (rc)
1072  {
1073  FREE(&t);
1074  FREE(&s);
1075  continue;
1076  }
1077  size_t slen = mutt_str_len(s);
1078 
1079  if (!tocode || (n < bestn))
1080  {
1081  bestn = n;
1082  FREE(&tocode);
1083  tocode = t;
1084  if (d)
1085  {
1086  FREE(&e);
1087  e = s;
1088  }
1089  else
1090  FREE(&s);
1091  elen = slen;
1092  }
1093  else
1094  {
1095  FREE(&t);
1096  FREE(&s);
1097  }
1098  }
1099  if (tocode)
1100  {
1101  if (d)
1102  *d = e;
1103  if (dlen)
1104  *dlen = elen;
1105 
1106  char canonical_buf[1024];
1107  mutt_ch_canonical_charset(canonical_buf, sizeof(canonical_buf), tocode);
1108  mutt_str_replace(&tocode, canonical_buf);
1109  }
1110  return tocode;
1111 }
General purpose object for storing and parsing strings.
Convenience wrapper for the config headers.
Convenience wrapper for the core headers.
static char * chs
Definition: gnupgparse.c:73
const char * cs_subset_string(const struct ConfigSubset *sub, const char *name)
Get a string config item by name.
Definition: helpers.c:317
void * mutt_mem_malloc(size_t size)
Allocate memory on the heap.
Definition: memory.c:90
void * mutt_mem_calloc(size_t nmemb, size_t size)
Allocate zeroed memory on the heap.
Definition: memory.c:50
Memory management wrappers.
#define FREE(x)
Definition: memory.h:40
#define MIN(a, b)
Definition: memory.h:31
bool mutt_ch_check_charset(const char *cs, bool strict)
Does iconv understand a character set?
Definition: charset.c:828
void mutt_ch_fgetconv_close(struct FgetConv **fc)
Close an fgetconv handle.
Definition: charset.c:898
size_t mutt_ch_iconv(iconv_t cd, const char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft, const char **inrepls, const char *outrepl, int *iconverrno)
Change the encoding of a string.
Definition: charset.c:629
void mutt_ch_lookup_remove(void)
Remove all the character set lookups.
Definition: charset.c:528
char * mutt_ch_get_langinfo_charset(void)
Get the user's choice of character set.
Definition: charset.c:473
char * mutt_ch_choose(const char *fromcode, const char *charsets, const char *u, size_t ulen, char **d, size_t *dlen)
Figure the best charset to encode a string.
Definition: charset.c:1046
static struct LookupList Lookups
Definition: charset.c:78
const struct MimeNames PreferredMimeNames[]
Lookup table of preferred charsets.
Definition: charset.c:99
static struct Lookup * lookup_new(void)
Create a new Lookup.
Definition: charset.c:247
bool mutt_ch_lookup_add(enum LookupType type, const char *pat, const char *replace, struct Buffer *err)
Add a new character set lookup.
Definition: charset.c:496
void mutt_ch_canonical_charset(char *buf, size_t buflen, const char *name)
Canonicalise the charset of a string.
Definition: charset.c:355
TAILQ_HEAD(LookupList, Lookup)
int mutt_ch_convert_string(char **ps, const char *from, const char *to, uint8_t flags)
Convert a string between encodings.
Definition: charset.c:764
struct FgetConv * mutt_ch_fgetconv_open(FILE *fp, const char *from, const char *to, uint8_t flags)
Prepare a file for charset conversion.
Definition: charset.c:868
void mutt_ch_set_charset(const char *charset)
Update the records for a new character set.
Definition: charset.c:1013
bool CharsetIsUtf8
Is the user's current character set utf-8?
Definition: charset.c:62
int mutt_ch_check(const char *s, size_t slen, const char *from, const char *to)
Check whether a string can be converted between encodings.
Definition: charset.c:728
int mutt_ch_fgetconv(struct FgetConv *fc)
Convert a file's character set.
Definition: charset.c:918
const char * mutt_ch_charset_lookup(const char *chs)
Look for a replacement character set.
Definition: charset.c:549
static void lookup_free(struct Lookup **ptr)
Free a Lookup.
Definition: charset.c:256
wchar_t ReplacementChar
When a Unicode character can't be displayed, use this instead.
Definition: charset.c:57
#define EILSEQ
Definition: charset.c:51
char * mutt_ch_get_default_charset(void)
Get the default character set.
Definition: charset.c:443
int mutt_ch_convert_nonmime_string(char **ps)
Try to convert a string using a list of character sets.
Definition: charset.c:307
char * mutt_ch_fgetconvs(char *buf, size_t buflen, struct FgetConv *fc)
Convert a file's charset into a string buffer.
Definition: charset.c:980
const char * mutt_ch_iconv_lookup(const char *chs)
Look for a replacement character set.
Definition: charset.c:713
bool mutt_ch_chscmp(const char *cs1, const char *cs2)
Are the names of two character sets equivalent?
Definition: charset.c:421
static const char * lookup_charset(enum LookupType type, const char *cs)
Look for a preferred character set name.
Definition: charset.c:281
iconv_t mutt_ch_iconv_open(const char *tocode, const char *fromcode, uint8_t flags)
Set up iconv for conversions.
Definition: charset.c:576
Conversion between different character encodings.
#define MUTT_ICONV_HOOK_FROM
apply charset-hooks to fromcode
Definition: charset.h:72
#define mutt_ch_is_utf8(str)
Definition: charset.h:95
LookupType
Types of character set lookups.
Definition: charset.h:66
@ MUTT_LOOKUP_ICONV
Character set conversion.
Definition: charset.h:68
@ MUTT_LOOKUP_CHARSET
Alias for another character set.
Definition: charset.h:67
#define MUTT_ICONV_NO_FLAGS
No flags are set.
Definition: charset.h:71
bool mutt_regex_match(const struct Regex *regex, const char *str)
Shorthand to mutt_regex_capture()
Definition: regex.c:629
bool mutt_istr_equal(const char *a, const char *b)
Compare two strings, ignoring case.
Definition: string.c:727
char * mutt_strn_dup(const char *begin, size_t len)
Duplicate a sub-string.
Definition: string.c:359
char * mutt_str_dup(const char *str)
Copy a string, safely.
Definition: string.c:181
void mutt_str_adjust(char **ptr)
Shrink-to-fit a string.
Definition: string.c:301
size_t mutt_str_len(const char *a)
Calculate the length of a string, safely.
Definition: string.c:475
size_t mutt_str_copy(char *dest, const char *src, size_t dsize)
Copy a string into a buffer (guaranteeing NUL-termination)
Definition: string.c:560
char * mutt_str_cat(char *buf, size_t buflen, const char *s)
Concatenate two strings.
Definition: string.c:196
char * mutt_str_replace(char **p, const char *s)
Replace one string with another.
Definition: string.c:257
size_t mutt_istr_startswith(const char *str, const char *prefix)
Check whether a string starts with a prefix, ignoring case.
Definition: string.c:170
bool mutt_istrn_equal(const char *a, const char *b, size_t num)
Check for equality of two strings ignoring case (to a maximum), safely.
Definition: string.c:432
static size_t plen
Length of cached packet.
Definition: pgppacket.c:39
#define TAILQ_FOREACH(var, head, field)
Definition: queue.h:725
#define TAILQ_FOREACH_SAFE(var, head, field, tvar)
Definition: queue.h:735
#define TAILQ_INSERT_TAIL(head, elm, field)
Definition: queue.h:809
#define TAILQ_REMOVE(head, elm, field)
Definition: queue.h:841
#define TAILQ_HEAD_INITIALIZER(head)
Definition: queue.h:637
#define TAILQ_ENTRY(type)
Definition: queue.h:640
Manage regular expressions.
#define REG_COMP(preg, regex, cflags)
Compile a regular expression.
Definition: regex3.h:54
String manipulation functions.
String manipulation buffer.
Definition: buffer.h:34
size_t dsize
Length of data.
Definition: buffer.h:37
char * data
Pointer to data.
Definition: buffer.h:35
A dummy converter.
Definition: charset.h:57
Cursor for converting a file's encoding.
Definition: charset.h:41
char bufi[512]
Definition: charset.h:44
iconv_t cd
Definition: charset.h:43
char bufo[512]
Definition: charset.h:45
size_t ibl
Definition: charset.h:49
FILE * fp
Definition: charset.h:42
char * p
Definition: charset.h:46
const char ** inrepls
Definition: charset.h:50
char * ib
Definition: charset.h:48
char * ob
Definition: charset.h:47
Regex to String lookup table.
Definition: charset.c:70
char * replacement
Alternative charset to use.
Definition: charset.c:73
enum LookupType type
Lookup type.
Definition: charset.c:71
struct Regex regex
Regular expression.
Definition: charset.c:72
MIME name lookup entry.
Definition: charset.c:84
const char * key
Definition: charset.c:85
const char * pref
Definition: charset.c:86
Container for Accounts, Notifications.
Definition: neomutt.h:37
struct ConfigSubset * sub
Inherited config items.
Definition: neomutt.h:39
Cached regular expression.
Definition: regex3.h:90
char * pattern
printable version
Definition: regex3.h:91
bool pat_not
do not match
Definition: regex3.h:93
regex_t * regex
compiled expression
Definition: regex3.h:92