NeoMutt  2021-10-29-220-g2b1eec
Teaching an old dog new tricks
DOXYGEN
charset.h File Reference

Conversion between different character encodings. More...

#include <iconv.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <wchar.h>
+ Include dependency graph for charset.h:
+ This graph shows which files directly or indirectly include this file:

Go to the source code of this file.

Data Structures

struct  FgetConv
 Cursor for converting a file's encoding. More...
 
struct  FgetConvNot
 A dummy converter. More...
 

Macros

#define MUTT_ICONV_NO_FLAGS   0
 No flags are set. More...
 
#define MUTT_ICONV_HOOK_FROM   1
 apply charset-hooks to fromcode More...
 
#define mutt_ch_is_utf8(str)   mutt_ch_chscmp(str, "utf-8")
 
#define mutt_ch_is_us_ascii(str)   mutt_ch_chscmp(str, "us-ascii")
 

Enumerations

enum  LookupType { MUTT_LOOKUP_CHARSET , MUTT_LOOKUP_ICONV }
 Types of character set lookups. More...
 

Functions

void mutt_ch_canonical_charset (char *buf, size_t buflen, const char *name)
 Canonicalise the charset of a string. More...
 
const char * mutt_ch_charset_lookup (const char *chs)
 Look for a replacement character set. More...
 
int mutt_ch_check (const char *s, size_t slen, const char *from, const char *to)
 Check whether a string can be converted between encodings. More...
 
bool mutt_ch_check_charset (const char *cs, bool strict)
 Does iconv understand a character set? More...
 
char * mutt_ch_choose (const char *fromcode, const char *charsets, const char *u, size_t ulen, char **d, size_t *dlen)
 Figure the best charset to encode a string. More...
 
bool mutt_ch_chscmp (const char *cs1, const char *cs2)
 Are the names of two character sets equivalent? More...
 
int mutt_ch_convert_nonmime_string (char **ps)
 Try to convert a string using a list of character sets. More...
 
int mutt_ch_convert_string (char **ps, const char *from, const char *to, uint8_t flags)
 Convert a string between encodings. More...
 
int mutt_ch_fgetconv (struct FgetConv *fc)
 Convert a file's character set. More...
 
void mutt_ch_fgetconv_close (struct FgetConv **fc)
 Close an fgetconv handle. More...
 
struct FgetConv * mutt_ch_fgetconv_open (FILE *fp, const char *from, const char *to, uint8_t flags)
 Prepare a file for charset conversion. More...
 
char * mutt_ch_fgetconvs (char *buf, size_t buflen, struct FgetConv *fc)
 Convert a file's charset into a string buffer. More...
 
char * mutt_ch_get_default_charset (void)
 Get the default character set. More...
 
char * mutt_ch_get_langinfo_charset (void)
 Get the user's choice of character set. More...
 
size_t mutt_ch_iconv (iconv_t cd, const char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft, const char **inrepls, const char *outrepl, int *iconverrno)
 Change the encoding of a string. More...
 
const char * mutt_ch_iconv_lookup (const char *chs)
 Look for a replacement character set. More...
 
iconv_t mutt_ch_iconv_open (const char *tocode, const char *fromcode, uint8_t flags)
 Set up iconv for conversions. More...
 
bool mutt_ch_lookup_add (enum LookupType type, const char *pat, const char *replace, struct Buffer *err)
 Add a new character set lookup. More...
 
void mutt_ch_lookup_remove (void)
 Remove all the character set lookups. More...
 
void mutt_ch_set_charset (const char *charset)
 Update the records for a new character set. More...
 

Variables

bool CharsetIsUtf8
 Is the user's current character set utf-8? More...
 
wchar_t ReplacementChar
 When a Unicode character can't be displayed, use this instead. More...
 

Detailed Description

Conversion between different character encodings.

Authors
  • Thomas Roessler

This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with this program. If not, see http://www.gnu.org/licenses/.

Definition in file charset.h.

Macro Definition Documentation

◆ MUTT_ICONV_NO_FLAGS

#define MUTT_ICONV_NO_FLAGS   0

No flags are set.

Definition at line 71 of file charset.h.

◆ MUTT_ICONV_HOOK_FROM

#define MUTT_ICONV_HOOK_FROM   1

apply charset-hooks to fromcode

Definition at line 72 of file charset.h.

◆ mutt_ch_is_utf8

#define mutt_ch_is_utf8(str)   mutt_ch_chscmp(str, "utf-8")

Definition at line 95 of file charset.h.

◆ mutt_ch_is_us_ascii

#define mutt_ch_is_us_ascii(str)   mutt_ch_chscmp(str, "us-ascii")

Definition at line 96 of file charset.h.

Enumeration Type Documentation

◆ LookupType

enum LookupType

Types of character set lookups.

Enumerator
MUTT_LOOKUP_CHARSET 

Alias for another character set.

MUTT_LOOKUP_ICONV 

Character set conversion.

Definition at line 65 of file charset.h.

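65 enum LookupType
66 {
67  MUTT_LOOKUP_CHARSET, ///< Alias for another character set
68  MUTT_LOOKUP_ICONV, ///< Character set conversion
69 };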
@ MUTT_LOOKUP_ICONV
Character set conversion.
Definition: charset.h:68
@ MUTT_LOOKUP_CHARSET
Alias for another character set.
Definition: charset.h:67

Function Documentation

◆ mutt_ch_canonical_charset()

void mutt_ch_canonical_charset ( char *  buf,
size_t  buflen,
const char *  name 
)

Canonicalise the charset of a string.

Parameters
buf — Buffer for canonical character set name
buflen — Length of buffer
name — Name to be canonicalised

This first ties off any charset extension such as "//TRANSLIT", canonicalises the charset, and then re-adds the extension.

Definition at line 355 of file charset.c.

356 {
357  if (!buf || !name)
358  return;
359 
360  char in[1024], scratch[1024 + 10];
361 
362  mutt_str_copy(in, name, sizeof(in));
363  char *ext = strchr(in, '/');
364  if (ext)
365  *ext++ = '\0';
366 
367  if (mutt_istr_equal(in, "utf-8") || mutt_istr_equal(in, "utf8"))
368  {
369  mutt_str_copy(buf, "utf-8", buflen);
370  goto out;
371  }
372 
373  /* catch some common iso-8859-something misspellings */
374  size_t plen;
375  if ((plen = mutt_istr_startswith(in, "8859")) && (in[plen] != '-'))
376  snprintf(scratch, sizeof(scratch), "iso-8859-%s", in + plen);
377  else if ((plen = mutt_istr_startswith(in, "8859-")))
378  snprintf(scratch, sizeof(scratch), "iso-8859-%s", in + plen);
379  else if ((plen = mutt_istr_startswith(in, "iso8859")) && (in[plen] != '-'))
380  snprintf(scratch, sizeof(scratch), "iso_8859-%s", in + plen);
381  else if ((plen = mutt_istr_startswith(in, "iso8859-")))
382  snprintf(scratch, sizeof(scratch), "iso_8859-%s", in + plen);
383  else
384  mutt_str_copy(scratch, in, sizeof(scratch));
385 
386  for (size_t i = 0; PreferredMimeNames[i].key; i++)
387  {
388  if (mutt_istr_equal(scratch, PreferredMimeNames[i].key))
389  {
390  mutt_str_copy(buf, PreferredMimeNames[i].pref, buflen);
391  goto out;
392  }
393  }
394 
395  mutt_str_copy(buf, scratch, buflen);
396 
397  /* for cosmetics' sake, transform to lowercase. */
398  for (char *p = buf; *p; p++)
399  *p = tolower(*p);
400 
401 out:
402  if (ext && *ext)
403  {
404  mutt_str_cat(buf, buflen, "/");
405  mutt_str_cat(buf, buflen, ext);
406  }
407 }
const struct MimeNames PreferredMimeNames[]
Lookup table of preferred charsets.
Definition: charset.c:99
bool mutt_istr_equal(const char *a, const char *b)
Compare two strings, ignoring case.
Definition: string.c:727
size_t mutt_str_copy(char *dest, const char *src, size_t dsize)
Copy a string into a buffer (guaranteeing NUL-termination)
Definition: string.c:560
char * mutt_str_cat(char *buf, size_t buflen, const char *s)
Concatenate two strings.
Definition: string.c:196
size_t mutt_istr_startswith(const char *str, const char *prefix)
Check whether a string starts with a prefix, ignoring case.
Definition: string.c:170
static size_t plen
Length of cached packet.
Definition: pgppacket.c:39
const char * key
Definition: charset.c:85
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_charset_lookup()

const char* mutt_ch_charset_lookup ( const char *  chs)

Look for a replacement character set.

Parameters
chs — Character set to lookup
Return values
ptr — Replacement character set (if a 'charset-hook' matches)
NULL — No matching hook

Look through all the 'charset-hook's. If one matches return the replacement character set.

Definition at line 549 of file charset.c.

550 {
551  return lookup_charset(MUTT_LOOKUP_CHARSET, chs);
552 }
static char * chs
Definition: gnupgparse.c:73
static const char * lookup_charset(enum LookupType type, const char *cs)
Look for a preferred character set name.
Definition: charset.c:281
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_check()

int mutt_ch_check ( const char *  s,
size_t  slen,
const char *  from,
const char *  to 
)

Check whether a string can be converted between encodings.

Parameters
[in] s — String to check
[in] slen — Length of the string to check
[in] from — Current character set
[in] to — Target character set
Return values
0 — Success
-1 — Error in iconv_open()
>0 — Errno as set by iconv()

Definition at line 728 of file charset.c.

729 {
730  if (!s || !from || !to)
731  return -1;
732 
733  int rc = 0;
734  iconv_t cd = mutt_ch_iconv_open(to, from, MUTT_ICONV_NO_FLAGS);
735  if (cd == (iconv_t) -1)
736  return -1;
737 
738  size_t outlen = MB_LEN_MAX * slen;
739  char *out = mutt_mem_malloc(outlen + 1);
740  char *saved_out = out;
741 
742  const size_t convlen = iconv(cd, (ICONV_CONST char **) &s, &slen, &out, &outlen);
743  if (convlen == -1)
744  rc = errno;
745 
746  FREE(&saved_out);
747  iconv_close(cd);
748  return rc;
749 }
void * mutt_mem_malloc(size_t size)
Allocate memory on the heap.
Definition: memory.c:90
#define FREE(x)
Definition: memory.h:40
iconv_t mutt_ch_iconv_open(const char *tocode, const char *fromcode, uint8_t flags)
Set up iconv for conversions.
Definition: charset.c:576
#define MUTT_ICONV_NO_FLAGS
No flags are set.
Definition: charset.h:71
+ Here is the call graph for this function:

◆ mutt_ch_check_charset()

bool mutt_ch_check_charset ( const char *  cs,
bool  strict 
)

Does iconv understand a character set?

Parameters
cs — Character set to check
strict — Check strictly by using iconv
Return values
true — Character set is valid

If strict is false, then finding a matching character set in PreferredMimeNames will be enough. If strict is true, or the charset is not in PreferredMimeNames, then iconv() will be run.

Definition at line 828 of file charset.c.

829 {
830  if (!cs)
831  return false;
832 
833  if (mutt_ch_is_utf8(cs))
834  return true;
835 
836  if (!strict)
837  {
838  for (int i = 0; PreferredMimeNames[i].key; i++)
839  {
840  if (mutt_istr_equal(PreferredMimeNames[i].key, cs) ||
841  mutt_istr_equal(PreferredMimeNames[i].pref, cs))
842  {
843  return true;
844  }
845  }
846  }
847 
848  iconv_t cd = mutt_ch_iconv_open(cs, cs, MUTT_ICONV_NO_FLAGS);
849  if (cd != (iconv_t) (-1))
850  {
851  iconv_close(cd);
852  return true;
853  }
854 
855  return false;
856 }
#define mutt_ch_is_utf8(str)
Definition: charset.h:95
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_choose()

char* mutt_ch_choose ( const char *  fromcode,
const char *  charsets,
const char *  u,
size_t  ulen,
char **  d,
size_t *  dlen 
)

Figure the best charset to encode a string.

Parameters
[in] fromcode — Original charset of the string
[in] charsets — Colon-separated list of potential charsets to use
[in] u — String to encode
[in] ulen — Length of the string to encode
[out] d — If not NULL, point it to the converted string
[out] dlen — If not NULL, point it to the length of the d string
Return values
ptr — Best performing charset
NULL — None could be found

Definition at line 1046 of file charset.c.

1048 {
1049  if (!fromcode)
1050  return NULL;
1051 
1052  char *e = NULL, *tocode = NULL;
1053  size_t elen = 0, bestn = 0;
1054  const char *q = NULL;
1055 
1056  for (const char *p = charsets; p; p = q ? q + 1 : 0)
1057  {
1058  q = strchr(p, ':');
1059 
1060  size_t n = q ? q - p : strlen(p);
1061  if (n == 0)
1062  continue;
1063 
1064  char *t = mutt_mem_malloc(n + 1);
1065  memcpy(t, p, n);
1066  t[n] = '\0';
1067 
1068  char *s = mutt_strn_dup(u, ulen);
1069  const int rc = d ? mutt_ch_convert_string(&s, fromcode, t, MUTT_ICONV_NO_FLAGS) :
1070  mutt_ch_check(s, ulen, fromcode, t);
1071  if (rc)
1072  {
1073  FREE(&t);
1074  FREE(&s);
1075  continue;
1076  }
1077  size_t slen = mutt_str_len(s);
1078 
1079  if (!tocode || (n < bestn))
1080  {
1081  bestn = n;
1082  FREE(&tocode);
1083  tocode = t;
1084  if (d)
1085  {
1086  FREE(&e);
1087  e = s;
1088  }
1089  else
1090  FREE(&s);
1091  elen = slen;
1092  }
1093  else
1094  {
1095  FREE(&t);
1096  FREE(&s);
1097  }
1098  }
1099  if (tocode)
1100  {
1101  if (d)
1102  *d = e;
1103  if (dlen)
1104  *dlen = elen;
1105 
1106  char canonical_buf[1024];
1107  mutt_ch_canonical_charset(canonical_buf, sizeof(canonical_buf), tocode);
1108  mutt_str_replace(&tocode, canonical_buf);
1109  }
1110  return tocode;
1111 }
void mutt_ch_canonical_charset(char *buf, size_t buflen, const char *name)
Canonicalise the charset of a string.
Definition: charset.c:355
int mutt_ch_convert_string(char **ps, const char *from, const char *to, uint8_t flags)
Convert a string between encodings.
Definition: charset.c:764
int mutt_ch_check(const char *s, size_t slen, const char *from, const char *to)
Check whether a string can be converted between encodings.
Definition: charset.c:728
char * mutt_strn_dup(const char *begin, size_t len)
Duplicate a sub-string.
Definition: string.c:359
size_t mutt_str_len(const char *a)
Calculate the length of a string, safely.
Definition: string.c:475
char * mutt_str_replace(char **p, const char *s)
Replace one string with another.
Definition: string.c:257
+ Here is the caller graph for this function:

◆ mutt_ch_chscmp()

bool mutt_ch_chscmp ( const char *  cs1,
const char *  cs2 
)

Are the names of two character sets equivalent?

Parameters
cs1 — First character set
cs2 — Second character set
Return values
true — Names are equivalent
false — Names differ

Charsets may have extensions that mutt_ch_canonical_charset() leaves intact. We expect 'cs2' to originate from neomutt code, not user input (i.e. 'cs2' does not have any extension), so we simply check whether the shorter string is a prefix of the longer one.

Definition at line 421 of file charset.c.

422 {
423  if (!cs1 || !cs2)
424  return false;
425 
426  char buf[256];
427 
428  mutt_ch_canonical_charset(buf, sizeof(buf), cs1);
429 
430  int len1 = mutt_str_len(buf);
431  int len2 = mutt_str_len(cs2);
432 
433  return mutt_istrn_equal(((len1 > len2) ? buf : cs2),
434  ((len1 > len2) ? cs2 : buf), MIN(len1, len2));
435 }
#define MIN(a, b)
Definition: memory.h:31
bool mutt_istrn_equal(const char *a, const char *b, size_t num)
Check for equality of two strings ignoring case (to a maximum), safely.
Definition: string.c:432
+ Here is the call graph for this function:

◆ mutt_ch_convert_nonmime_string()

int mutt_ch_convert_nonmime_string ( char **  ps)

Try to convert a string using a list of character sets.

Parameters
[in,out] ps — String to be converted
Return values
0 — Success
-1 — Error

Work through $assumed_charset looking for a character set conversion that works. Failing that, try mutt_ch_get_default_charset().

Definition at line 307 of file charset.c.

308 {
309  if (!ps)
310  return -1;
311 
312  char *u = *ps;
313  const size_t ulen = mutt_str_len(u);
314  if (ulen == 0)
315  return 0;
316 
317  const char *c1 = NULL;
318 
319  const char *const c_assumed_charset =
320  cs_subset_string(NeoMutt->sub, "assumed_charset");
321  const char *const c_charset = cs_subset_string(NeoMutt->sub, "charset");
322  for (const char *c = c_assumed_charset; c; c = c1 ? c1 + 1 : 0)
323  {
324  c1 = strchr(c, ':');
325  size_t n = c1 ? c1 - c : mutt_str_len(c);
326  if (n == 0)
327  return 0;
328  char *fromcode = mutt_mem_malloc(n + 1);
329  mutt_str_copy(fromcode, c, n + 1);
330  char *s = mutt_strn_dup(u, ulen);
331  int m = mutt_ch_convert_string(&s, fromcode, c_charset, MUTT_ICONV_NO_FLAGS);
332  FREE(&fromcode);
333  if (m == 0)
334  {
335  FREE(ps);
336  *ps = s;
337  return 0;
338  }
339  FREE(&s);
340  }
341  mutt_ch_convert_string(ps, (const char *) mutt_ch_get_default_charset(),
342  c_charset, MUTT_ICONV_HOOK_FROM);
343  return -1;
344 }
const char * cs_subset_string(const struct ConfigSubset *sub, const char *name)
Get a string config item by name.
Definition: helpers.c:317
char * mutt_ch_get_default_charset(void)
Get the default character set.
Definition: charset.c:443
#define MUTT_ICONV_HOOK_FROM
apply charset-hooks to fromcode
Definition: charset.h:72
Container for Accounts, Notifications.
Definition: neomutt.h:37
struct ConfigSubset * sub
Inherited config items.
Definition: neomutt.h:39
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_convert_string()

int mutt_ch_convert_string ( char **  ps,
const char *  from,
const char *  to,
uint8_t  flags 
)

Convert a string between encodings.

Parameters
[in,out] ps — String to convert
[in] from — Current character set
[in] to — Target character set
[in] flags — Flags, e.g. MUTT_ICONV_HOOK_FROM
Return values
0 — Success
-1 — Invalid arguments or failure to open an iconv channel
errno — Failure in iconv conversion

Parameter flags is given as-is to mutt_ch_iconv_open(). See there for its meaning and usage policy.

Definition at line 764 of file charset.c.

765 {
766  if (!ps)
767  return -1;
768 
769  char *s = *ps;
770 
771  if (!s || (*s == '\0'))
772  return 0;
773 
774  if (!to || !from)
775  return -1;
776 
777  const char *repls[] = { "\357\277\275", "?", 0 };
778  int rc = 0;
779 
780  iconv_t cd = mutt_ch_iconv_open(to, from, flags);
781  if (cd == (iconv_t) -1)
782  return -1;
783 
784  size_t len;
785  const char *ib = NULL;
786  char *buf = NULL, *ob = NULL;
787  size_t ibl, obl;
788  const char **inrepls = NULL;
789  const char *outrepl = NULL;
790 
791  if (mutt_ch_is_utf8(to))
792  outrepl = "\357\277\275";
793  else if (mutt_ch_is_utf8(from))
794  inrepls = repls;
795  else
796  outrepl = "?";
797 
798  len = strlen(s);
799  ib = s;
800  ibl = len + 1;
801  obl = MB_LEN_MAX * ibl;
802  buf = mutt_mem_malloc(obl + 1);
803  ob = buf;
804 
805  mutt_ch_iconv(cd, &ib, &ibl, &ob, &obl, inrepls, outrepl, &rc);
806  iconv_close(cd);
807 
808  *ob = '\0';
809 
810  FREE(ps);
811  *ps = buf;
812 
813  mutt_str_adjust(ps);
814  return rc;
815 }
size_t mutt_ch_iconv(iconv_t cd, const char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft, const char **inrepls, const char *outrepl, int *iconverrno)
Change the encoding of a string.
Definition: charset.c:629
void mutt_str_adjust(char **ptr)
Shrink-to-fit a string.
Definition: string.c:301
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_fgetconv()

int mutt_ch_fgetconv ( struct FgetConv *  fc)

Convert a file's character set.

Parameters
fc — FgetConv handle
Return values
num — Next character in the converted file
EOF — Error or end of file

A file is read into a buffer and its character set is converted. Each call to this function will return one converted character. The buffer is refilled automatically when empty.

Definition at line 918 of file charset.c.

919 {
920  if (!fc)
921  return EOF;
922  if (fc->cd == (iconv_t) -1)
923  return fgetc(fc->fp);
924  if (!fc->p)
925  return EOF;
926  if (fc->p < fc->ob)
927  return (unsigned char) *(fc->p)++;
928 
929  /* Try to convert some more */
930  fc->p = fc->bufo;
931  fc->ob = fc->bufo;
932  if (fc->ibl)
933  {
934  size_t obl = sizeof(fc->bufo);
935  iconv(fc->cd, (ICONV_CONST char **) &fc->ib, &fc->ibl, &fc->ob, &obl);
936  if (fc->p < fc->ob)
937  return (unsigned char) *(fc->p)++;
938  }
939 
940  /* If we trusted iconv a bit more, we would at this point
941  * ask why it had stopped converting ... */
942 
943  /* Try to read some more */
944  if ((fc->ibl == sizeof(fc->bufi)) ||
945  (fc->ibl && (fc->ib + fc->ibl < fc->bufi + sizeof(fc->bufi))))
946  {
947  fc->p = 0;
948  return EOF;
949  }
950  if (fc->ibl)
951  memcpy(fc->bufi, fc->ib, fc->ibl);
952  fc->ib = fc->bufi;
953  fc->ibl += fread(fc->ib + fc->ibl, 1, sizeof(fc->bufi) - fc->ibl, fc->fp);
954 
955  /* Try harder this time to convert some */
956  if (fc->ibl)
957  {
958  size_t obl = sizeof(fc->bufo);
959  mutt_ch_iconv(fc->cd, (const char **) &fc->ib, &fc->ibl, &fc->ob, &obl,
960  fc->inrepls, 0, NULL);
961  if (fc->p < fc->ob)
962  return (unsigned char) *(fc->p)++;
963  }
964 
965  /* Either the file has finished or one of the buffers is too small */
966  fc->p = 0;
967  return EOF;
968 }
char bufi[512]
Definition: charset.h:44
iconv_t cd
Definition: charset.h:43
char bufo[512]
Definition: charset.h:45
size_t ibl
Definition: charset.h:49
FILE * fp
Definition: charset.h:42
char * p
Definition: charset.h:46
const char ** inrepls
Definition: charset.h:50
char * ib
Definition: charset.h:48
char * ob
Definition: charset.h:47
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_fgetconv_close()

void mutt_ch_fgetconv_close ( struct FgetConv **  fc)

Close an fgetconv handle.

Parameters
[out] fc — fgetconv handle

Definition at line 898 of file charset.c.

899 {
900  if (!fc || !*fc)
901  return;
902 
903  if ((*fc)->cd != (iconv_t) -1)
904  iconv_close((*fc)->cd);
905  FREE(fc);
906 }
+ Here is the caller graph for this function:

◆ mutt_ch_fgetconv_open()

struct FgetConv* mutt_ch_fgetconv_open ( FILE *  fp,
const char *  from,
const char *  to,
uint8_t  flags 
)

Prepare a file for charset conversion.

Parameters
fp — FILE ptr to prepare
from — Current character set
to — Destination character set
flags — Flags, e.g. MUTT_ICONV_HOOK_FROM
Return values
ptr — fgetconv handle

Parameter flags is given as-is to mutt_ch_iconv_open().

Definition at line 868 of file charset.c.

869 {
870  struct FgetConv *fc = NULL;
871  iconv_t cd = (iconv_t) -1;
872 
873  if (from && to)
874  cd = mutt_ch_iconv_open(to, from, flags);
875 
876  if (cd != (iconv_t) -1)
877  {
878  static const char *repls[] = { "\357\277\275", "?", 0 };
879 
880  fc = mutt_mem_malloc(sizeof(struct FgetConv));
881  fc->p = fc->bufo;
882  fc->ob = fc->bufo;
883  fc->ib = fc->bufi;
884  fc->ibl = 0;
885  fc->inrepls = mutt_ch_is_utf8(to) ? repls : repls + 1;
886  }
887  else
888  fc = mutt_mem_malloc(sizeof(struct FgetConvNot));
889  fc->fp = fp;
890  fc->cd = cd;
891  return fc;
892 }
A dummy converter.
Definition: charset.h:57
Cursor for converting a file's encoding.
Definition: charset.h:41
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_fgetconvs()

char* mutt_ch_fgetconvs ( char *  buf,
size_t  buflen,
struct FgetConv *  fc 
)

Convert a file's charset into a string buffer.

Parameters
buf — Buffer for result
buflen — Length of buffer
fc — FgetConv handle
Return values
ptr — Success, result buffer
NULL — Error

Read a file into a buffer, converting the character set as it goes.

Definition at line 980 of file charset.c.

981 {
982  if (!buf)
983  return NULL;
984 
985  size_t r;
986  for (r = 0; (r + 1) < buflen;)
987  {
988  const int c = mutt_ch_fgetconv(fc);
989  if (c == EOF)
990  break;
991  buf[r++] = (char) c;
992  if (c == '\n')
993  break;
994  }
995  buf[r] = '\0';
996 
997  if (r > 0)
998  return buf;
999 
1000  return NULL;
1001 }
int mutt_ch_fgetconv(struct FgetConv *fc)
Convert a file's character set.
Definition: charset.c:918
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_get_default_charset()

char* mutt_ch_get_default_charset ( void  )

Get the default character set.

Return values
ptr — Name of the default character set
Warning
This returns a pointer to a static buffer. Do not free it.

Definition at line 443 of file charset.c.

444 {
445  static char fcharset[128];
446  const char *const c_assumed_charset =
447  cs_subset_string(NeoMutt->sub, "assumed_charset");
448  const char *c = c_assumed_charset;
449  const char *c1 = NULL;
450 
451  if (c)
452  {
453  c1 = strchr(c, ':');
454 
455  size_t copysize;
456  if (c1)
457  copysize = MIN((c1 - c + 1), sizeof(fcharset));
458  else
459  copysize = sizeof(fcharset);
460  mutt_str_copy(fcharset, c, copysize);
461  return fcharset;
462  }
463  return strcpy(fcharset, "us-ascii");
464 }
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_get_langinfo_charset()

char* mutt_ch_get_langinfo_charset ( void  )

Get the user's choice of character set.

Return values
ptr — Charset string

Get the canonical character set used by the user's locale. The caller must free the returned string.

Definition at line 473 of file charset.c.

474 {
475  char buf[1024] = { 0 };
476 
477  mutt_ch_canonical_charset(buf, sizeof(buf), nl_langinfo(CODESET));
478 
479  if (buf[0] != '\0')
480  return mutt_str_dup(buf);
481 
482  return mutt_str_dup("iso-8859-1");
483 }
char * mutt_str_dup(const char *str)
Copy a string, safely.
Definition: string.c:181
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_iconv()

size_t mutt_ch_iconv ( iconv_t  cd,
const char **  inbuf,
size_t *  inbytesleft,
char **  outbuf,
size_t *  outbytesleft,
const char **  inrepls,
const char *  outrepl,
int *  iconverrno 
)

Change the encoding of a string.

Parameters
[in] cd — Iconv conversion descriptor
[in,out] inbuf — Buffer to convert
[in,out] inbytesleft — Length of buffer to convert
[in,out] outbuf — Buffer for the result
[in,out] outbytesleft — Length of result buffer
[in] inrepls — Input replacement characters
[in] outrepl — Output replacement characters
[out] iconverrno — Errno if iconv() fails, 0 if it succeeds
Return values
num — Characters converted

Like iconv, but keeps going even when the input is invalid. If you're supplying inrepls, the source charset should be stateless; if you're supplying an outrepl, the target charset should be.

Definition at line 629 of file charset.c.

632 {
633  size_t rc = 0;
634  const char *ib = *inbuf;
635  size_t ibl = *inbytesleft;
636  char *ob = *outbuf;
637  size_t obl = *outbytesleft;
638 
639  while (true)
640  {
641  errno = 0;
642  const size_t ret1 = iconv(cd, (ICONV_CONST char **) &ib, &ibl, &ob, &obl);
643  if (ret1 != (size_t) -1)
644  rc += ret1;
645  if (iconverrno)
646  *iconverrno = errno;
647 
648  if (ibl && obl && (errno == EILSEQ))
649  {
650  if (inrepls)
651  {
652  /* Try replacing the input */
653  const char **t = NULL;
654  for (t = inrepls; *t; t++)
655  {
656  const char *ib1 = *t;
657  size_t ibl1 = strlen(*t);
658  char *ob1 = ob;
659  size_t obl1 = obl;
660  iconv(cd, (ICONV_CONST char **) &ib1, &ibl1, &ob1, &obl1);
661  if (ibl1 == 0)
662  {
663  ib++;
664  ibl--;
665  ob = ob1;
666  obl = obl1;
667  rc++;
668  break;
669  }
670  }
671  if (*t)
672  continue;
673  }
674  /* Replace the output */
675  if (!outrepl)
676  outrepl = "?";
677  iconv(cd, NULL, NULL, &ob, &obl);
678  if (obl)
679  {
680  int n = strlen(outrepl);
681  if (n > obl)
682  {
683  outrepl = "?";
684  n = 1;
685  }
686  memcpy(ob, outrepl, n);
687  ib++;
688  ibl--;
689  ob += n;
690  obl -= n;
691  rc++;
692  iconv(cd, NULL, NULL, NULL, NULL); /* for good measure */
693  continue;
694  }
695  }
696  *inbuf = ib;
697  *inbytesleft = ibl;
698  *outbuf = ob;
699  *outbytesleft = obl;
700  return rc;
701  }
702 }
#define EILSEQ
Definition: charset.c:51
+ Here is the caller graph for this function:

◆ mutt_ch_iconv_lookup()

const char* mutt_ch_iconv_lookup ( const char *  chs)

Look for a replacement character set.

Parameters
chs — Character set to lookup
Return values
ptr — Replacement character set (if a 'iconv-hook' matches)
NULL — No matching hook

Look through all the 'iconv-hook's. If one matches return the replacement character set.

Definition at line 713 of file charset.c.

714 {
715  return lookup_charset(MUTT_LOOKUP_ICONV, chs);
716 }
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_iconv_open()

iconv_t mutt_ch_iconv_open ( const char *  tocode,
const char *  fromcode,
uint8_t  flags 
)

Set up iconv for conversions.

Parameters
tocode — Target character set
fromcode — Current character set
flags — Flags, e.g. MUTT_ICONV_HOOK_FROM
Return values
ptr — iconv handle for the conversion

Like iconv_open, but canonicalises the charsets, applies charset-hooks, recanonicalises, and finally applies iconv-hooks. Parameter flags=0 skips charset-hooks, while MUTT_ICONV_HOOK_FROM applies them to fromcode. Callers should use flags=0 when fromcode can safely be considered true, either some constant, or some value provided by the user; MUTT_ICONV_HOOK_FROM should be used only when fromcode is unsure, taken from a possibly wrong incoming MIME label, or such. Misusing MUTT_ICONV_HOOK_FROM leads to unwanted interactions in some setups.

Note
By design charset-hooks should never be, and are never, applied to tocode.
Despite its generic name, MUTT_ICONV_HOOK_FROM controls only charset-hooks; it has no effect on iconv-hooks, which are always applied.

Definition at line 576 of file charset.c.

577 {
578  char tocode1[128];
579  char fromcode1[128];
580  const char *tocode2 = NULL, *fromcode2 = NULL;
581  const char *tmp = NULL;
582 
583  iconv_t cd;
584 
585  /* transform to MIME preferred charset names */
586  mutt_ch_canonical_charset(tocode1, sizeof(tocode1), tocode);
587  mutt_ch_canonical_charset(fromcode1, sizeof(fromcode1), fromcode);
588 
589  /* maybe apply charset-hooks and recanonicalise fromcode,
590  * but only when caller asked us to sanitize a potentially wrong
591  * charset name incoming from the wild exterior. */
592  if (flags & MUTT_ICONV_HOOK_FROM)
593  {
594  tmp = mutt_ch_charset_lookup(fromcode1);
595  if (tmp)
596  mutt_ch_canonical_charset(fromcode1, sizeof(fromcode1), tmp);
597  }
598 
599  /* always apply iconv-hooks to suit system's iconv tastes */
600  tocode2 = mutt_ch_iconv_lookup(tocode1);
601  tocode2 = tocode2 ? tocode2 : tocode1;
602  fromcode2 = mutt_ch_iconv_lookup(fromcode1);
603  fromcode2 = fromcode2 ? fromcode2 : fromcode1;
604 
605  /* call system iconv with names it appreciates */
606  cd = iconv_open(tocode2, fromcode2);
607  if (cd != (iconv_t) -1)
608  return cd;
609 
610  return (iconv_t) -1;
611 }
const char * mutt_ch_charset_lookup(const char *chs)
Look for a replacement character set.
Definition: charset.c:549
const char * mutt_ch_iconv_lookup(const char *chs)
Look for a replacement character set.
Definition: charset.c:713
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_lookup_add()

bool mutt_ch_lookup_add ( enum LookupType  type,
const char *  pat,
const char *  replace,
struct Buffer *  err 
)

Add a new character set lookup.

Parameters
type — Type of character set, e.g. MUTT_LOOKUP_CHARSET
pat — Pattern to match
replace — Replacement string
err — Buffer for error message
Return values
true — Lookup added to list
false — Regex string was invalid

Add a regex for a character set and a replacement name.

Definition at line 496 of file charset.c.

498 {
499  if (!pat || !replace)
500  return false;
501 
502  regex_t *rx = mutt_mem_calloc(1, sizeof(regex_t));
503  int rc = REG_COMP(rx, pat, REG_ICASE);
504  if (rc != 0)
505  {
506  regerror(rc, rx, err->data, err->dsize);
507  FREE(&rx);
508  return false;
509  }
510 
511  struct Lookup *l = lookup_new();
512  l->type = type;
513  l->replacement = mutt_str_dup(replace);
514  l->regex.pattern = mutt_str_dup(pat);
515  l->regex.regex = rx;
516  l->regex.pat_not = false;
517 
518  TAILQ_INSERT_TAIL(&Lookups, l, entries);
519 
520  return true;
521 }
void * mutt_mem_calloc(size_t nmemb, size_t size)
Allocate zeroed memory on the heap.
Definition: memory.c:50
static struct LookupList Lookups
Definition: charset.c:78
static struct Lookup * lookup_new(void)
Create a new Lookup.
Definition: charset.c:247
#define TAILQ_INSERT_TAIL(head, elm, field)
Definition: queue.h:809
#define REG_COMP(preg, regex, cflags)
Compile a regular expression.
Definition: regex3.h:54
size_t dsize
Length of data.
Definition: buffer.h:37
char * data
Pointer to data.
Definition: buffer.h:35
Regex to String lookup table.
Definition: charset.c:70
char * replacement
Alternative charset to use.
Definition: charset.c:73
enum LookupType type
Lookup type.
Definition: charset.c:71
struct Regex regex
Regular expression.
Definition: charset.c:72
char * pattern
printable version
Definition: regex3.h:91
bool pat_not
do not match
Definition: regex3.h:93
regex_t * regex
compiled expression
Definition: regex3.h:92
+ Here is the call graph for this function:

◆ mutt_ch_lookup_remove()

void mutt_ch_lookup_remove ( void  )

Remove all the character set lookups.

Empty the list of replacement character set names.

Definition at line 528 of file charset.c.

529 {
530  struct Lookup *l = NULL;
531  struct Lookup *tmp = NULL;
532 
533  TAILQ_FOREACH_SAFE(l, &Lookups, entries, tmp)
534  {
535  TAILQ_REMOVE(&Lookups, l, entries);
536  lookup_free(&l);
537  }
538 }
static void lookup_free(struct Lookup **ptr)
Free a Lookup.
Definition: charset.c:256
#define TAILQ_FOREACH_SAFE(var, head, field, tvar)
Definition: queue.h:735
#define TAILQ_REMOVE(head, elm, field)
Definition: queue.h:841
+ Here is the call graph for this function:

◆ mutt_ch_set_charset()

void mutt_ch_set_charset ( const char *  charset)

Update the records for a new character set.

Parameters
charset — New character set

Check if this character set is utf-8 and pick a suitable replacement character for unprintable characters.

Note
This calls bind_textdomain_codeset() which will affect future message translations.

Definition at line 1013 of file charset.c.

1014 {
1015  char buf[256];
1016 
1017  mutt_ch_canonical_charset(buf, sizeof(buf), charset);
1018 
1019  if (mutt_ch_is_utf8(buf))
1020  {
1021  CharsetIsUtf8 = true;
1022  ReplacementChar = 0xfffd; /* replacement character */
1023  }
1024  else
1025  {
1026  CharsetIsUtf8 = false;
1027  ReplacementChar = '?';
1028  }
1029 
1030 #if defined(HAVE_BIND_TEXTDOMAIN_CODESET) && defined(ENABLE_NLS)
1031  bind_textdomain_codeset(PACKAGE, buf);
1032 #endif
1033 }
bool CharsetIsUtf8
Is the user's current character set utf-8?
Definition: charset.c:62
wchar_t ReplacementChar
When a Unicode character can't be displayed, use this instead.
Definition: charset.c:57
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

Variable Documentation

◆ CharsetIsUtf8

bool CharsetIsUtf8
extern

Is the user's current character set utf-8?

Definition at line 62 of file charset.c.

◆ ReplacementChar

wchar_t ReplacementChar
extern

When a Unicode character can't be displayed, use this instead.

Definition at line 57 of file charset.c.