NeoMutt  2022-04-29-145-g9b6a0e
Teaching an old dog new tricks
DOXYGEN
charset.h File Reference

Conversion between different character encodings. More...

#include <iconv.h>
#include <stdbool.h>
#include <stdint.h>
#include <wchar.h>
+ Include dependency graph for charset.h:
+ This graph shows which files directly or indirectly include this file:

Go to the source code of this file.

Data Structures

struct  FgetConv
 Cursor for converting a file's encoding. More...
 
struct  FgetConvNot
 A dummy converter. More...
 

Macros

#define MUTT_ICONV_NO_FLAGS   0
 No flags are set. More...
 
#define MUTT_ICONV_HOOK_FROM   1
 apply charset-hooks to fromcode More...
 
#define mutt_ch_is_utf8(str)   mutt_ch_chscmp(str, "utf-8")
 
#define mutt_ch_is_us_ascii(str)   mutt_ch_chscmp(str, "us-ascii")
 

Enumerations

enum  LookupType { MUTT_LOOKUP_CHARSET , MUTT_LOOKUP_ICONV }
 Types of character set lookups. More...
 

Functions

void mutt_ch_canonical_charset (char *buf, size_t buflen, const char *name)
 Canonicalise the charset of a string. More...
 
const char * mutt_ch_charset_lookup (const char *chs)
 Look for a replacement character set. More...
 
int mutt_ch_check (const char *s, size_t slen, const char *from, const char *to)
 Check whether a string can be converted between encodings. More...
 
bool mutt_ch_check_charset (const char *cs, bool strict)
 Does iconv understand a character set? More...
 
char * mutt_ch_choose (const char *fromcode, const struct Slist *charsets, const char *u, size_t ulen, char **d, size_t *dlen)
 Figure the best charset to encode a string. More...
 
bool mutt_ch_chscmp (const char *cs1, const char *cs2)
 Are the names of two character sets equivalent? More...
 
int mutt_ch_convert_nonmime_string (char **ps)
 Try to convert a string using a list of character sets. More...
 
int mutt_ch_convert_string (char **ps, const char *from, const char *to, uint8_t flags)
 Convert a string between encodings. More...
 
int mutt_ch_fgetconv (struct FgetConv *fc)
 Convert a file's character set. More...
 
void mutt_ch_fgetconv_close (struct FgetConv **fc)
 Close an fgetconv handle. More...
 
struct FgetConv * mutt_ch_fgetconv_open (FILE *fp, const char *from, const char *to, uint8_t flags)
 Prepare a file for charset conversion. More...
 
char * mutt_ch_fgetconvs (char *buf, size_t buflen, struct FgetConv *fc)
 Convert a file's charset into a string buffer. More...
 
char * mutt_ch_get_default_charset (void)
 Get the default character set. More...
 
char * mutt_ch_get_langinfo_charset (void)
 Get the user's choice of character set. More...
 
size_t mutt_ch_iconv (iconv_t cd, const char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft, const char **inrepls, const char *outrepl, int *iconverrno)
 Change the encoding of a string. More...
 
const char * mutt_ch_iconv_lookup (const char *chs)
 Look for a replacement character set. More...
 
iconv_t mutt_ch_iconv_open (const char *tocode, const char *fromcode, uint8_t flags)
 Set up iconv for conversions. More...
 
bool mutt_ch_lookup_add (enum LookupType type, const char *pat, const char *replace, struct Buffer *err)
 Add a new character set lookup. More...
 
void mutt_ch_lookup_remove (void)
 Remove all the character set lookups. More...
 
void mutt_ch_set_charset (const char *charset)
 Update the records for a new character set. More...
 

Variables

bool CharsetIsUtf8
 Is the user's current character set utf-8? More...
 
wchar_t ReplacementChar
 When a Unicode character can't be displayed, use this instead. More...
 

Detailed Description

Conversion between different character encodings.

Authors
  • Thomas Roessler

This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with this program. If not, see http://www.gnu.org/licenses/.

Definition in file charset.h.

Macro Definition Documentation

◆ MUTT_ICONV_NO_FLAGS

#define MUTT_ICONV_NO_FLAGS   0

No flags are set.

Definition at line 71 of file charset.h.

◆ MUTT_ICONV_HOOK_FROM

#define MUTT_ICONV_HOOK_FROM   1

apply charset-hooks to fromcode

Definition at line 72 of file charset.h.

◆ mutt_ch_is_utf8

#define mutt_ch_is_utf8 (   str)    mutt_ch_chscmp(str, "utf-8")

Definition at line 95 of file charset.h.

◆ mutt_ch_is_us_ascii

#define mutt_ch_is_us_ascii (   str)    mutt_ch_chscmp(str, "us-ascii")

Definition at line 96 of file charset.h.

Enumeration Type Documentation

◆ LookupType

enum LookupType

Types of character set lookups.

Enumerator
MUTT_LOOKUP_CHARSET 

Alias for another character set.

MUTT_LOOKUP_ICONV 

Character set conversion.

Definition at line 65 of file charset.h.

66 {
69 };
@ MUTT_LOOKUP_ICONV
Character set conversion.
Definition: charset.h:68
@ MUTT_LOOKUP_CHARSET
Alias for another character set.
Definition: charset.h:67

Function Documentation

◆ mutt_ch_canonical_charset()

void mutt_ch_canonical_charset ( char *  buf,
size_t  buflen,
const char *  name 
)

Canonicalise the charset of a string.

Parameters
buf — Buffer for canonical character set name
buflen — Length of buffer
name — Name to be canonicalised

This first ties off any charset extension such as "//TRANSLIT", canonicalizes the charset and re-adds the extension.

Definition at line 351 of file charset.c.

352 {
353  if (!buf || !name)
354  return;
355 
356  char in[1024], scratch[1024 + 10];
357 
358  mutt_str_copy(in, name, sizeof(in));
359  char *ext = strchr(in, '/');
360  if (ext)
361  *ext++ = '\0';
362 
363  if (mutt_istr_equal(in, "utf-8") || mutt_istr_equal(in, "utf8"))
364  {
365  mutt_str_copy(buf, "utf-8", buflen);
366  goto out;
367  }
368 
369  /* catch some common iso-8859-something misspellings */
370  size_t plen;
371  if ((plen = mutt_istr_startswith(in, "8859")) && (in[plen] != '-'))
372  snprintf(scratch, sizeof(scratch), "iso-8859-%s", in + plen);
373  else if ((plen = mutt_istr_startswith(in, "8859-")))
374  snprintf(scratch, sizeof(scratch), "iso-8859-%s", in + plen);
375  else if ((plen = mutt_istr_startswith(in, "iso8859")) && (in[plen] != '-'))
376  snprintf(scratch, sizeof(scratch), "iso_8859-%s", in + plen);
377  else if ((plen = mutt_istr_startswith(in, "iso8859-")))
378  snprintf(scratch, sizeof(scratch), "iso_8859-%s", in + plen);
379  else
380  mutt_str_copy(scratch, in, sizeof(scratch));
381 
382  for (size_t i = 0; PreferredMimeNames[i].key; i++)
383  {
384  if (mutt_istr_equal(scratch, PreferredMimeNames[i].key))
385  {
386  mutt_str_copy(buf, PreferredMimeNames[i].pref, buflen);
387  goto out;
388  }
389  }
390 
391  mutt_str_copy(buf, scratch, buflen);
392 
393  /* for cosmetics' sake, transform to lowercase. */
394  for (char *p = buf; *p; p++)
395  *p = tolower(*p);
396 
397 out:
398  if (ext && *ext)
399  {
400  mutt_str_cat(buf, buflen, "/");
401  mutt_str_cat(buf, buflen, ext);
402  }
403 }
const struct MimeNames PreferredMimeNames[]
Lookup table of preferred charsets.
Definition: charset.c:99
bool mutt_istr_equal(const char *a, const char *b)
Compare two strings, ignoring case.
Definition: string.c:796
size_t mutt_str_copy(char *dest, const char *src, size_t dsize)
Copy a string into a buffer (guaranteeing NUL-termination)
Definition: string.c:629
char * mutt_str_cat(char *buf, size_t buflen, const char *s)
Concatenate two strings.
Definition: string.c:265
size_t mutt_istr_startswith(const char *str, const char *prefix)
Check whether a string starts with a prefix, ignoring case.
Definition: string.c:239
static size_t plen
Length of cached packet.
Definition: pgppacket.c:39
const char * key
Definition: charset.c:85
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_charset_lookup()

const char* mutt_ch_charset_lookup ( const char *  chs)

Look for a replacement character set.

Parameters
chs — Character set to lookup
Return values
ptr — Replacement character set (if a 'charset-hook' matches)
NULL — No matching hook

Look through all the 'charset-hook's. If one matches return the replacement character set.

Definition at line 537 of file charset.c.

538 {
540 }
static char * chs
Definition: gnupgparse.c:73
static const char * lookup_charset(enum LookupType type, const char *cs)
Look for a preferred character set name.
Definition: charset.c:281
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_check()

int mutt_ch_check ( const char *  s,
size_t  slen,
const char *  from,
const char *  to 
)

Check whether a string can be converted between encodings.

Parameters
[in] s — String to check
[in] slen — Length of the string to check
[in] from — Current character set
[in] to — Target character set
Return values
0 — Success
-1 — Error in iconv_open()
>0 — Errno as set by iconv()

Definition at line 716 of file charset.c.

717 {
718  if (!s || !from || !to)
719  return -1;
720 
721  int rc = 0;
722  iconv_t cd = mutt_ch_iconv_open(to, from, MUTT_ICONV_NO_FLAGS);
723  if (cd == (iconv_t) -1)
724  return -1;
725 
726  size_t outlen = MB_LEN_MAX * slen;
727  char *out = mutt_mem_malloc(outlen + 1);
728  char *saved_out = out;
729 
730  const size_t convlen = iconv(cd, (ICONV_CONST char **) &s, &slen, &out, &outlen);
731  if (convlen == -1)
732  rc = errno;
733 
734  FREE(&saved_out);
735  iconv_close(cd);
736  return rc;
737 }
void * mutt_mem_malloc(size_t size)
Allocate memory on the heap.
Definition: memory.c:90
#define FREE(x)
Definition: memory.h:43
iconv_t mutt_ch_iconv_open(const char *tocode, const char *fromcode, uint8_t flags)
Set up iconv for conversions.
Definition: charset.c:564
#define MUTT_ICONV_NO_FLAGS
No flags are set.
Definition: charset.h:71
+ Here is the call graph for this function:

◆ mutt_ch_check_charset()

bool mutt_ch_check_charset ( const char *  cs,
bool  strict 
)

Does iconv understand a character set?

Parameters
cs — Character set to check
strict — Check strictly by using iconv
Return values
true — Character set is valid

If strict is false, then finding a matching character set in PreferredMimeNames will be enough. If strict is true, or the charset is not in PreferredMimeNames, then iconv() will be run.

Definition at line 817 of file charset.c.

818 {
819  if (!cs)
820  return false;
821 
822  if (mutt_ch_is_utf8(cs))
823  return true;
824 
825  if (!strict)
826  {
827  for (int i = 0; PreferredMimeNames[i].key; i++)
828  {
829  if (mutt_istr_equal(PreferredMimeNames[i].key, cs) ||
830  mutt_istr_equal(PreferredMimeNames[i].pref, cs))
831  {
832  return true;
833  }
834  }
835  }
836 
837  iconv_t cd = mutt_ch_iconv_open(cs, cs, MUTT_ICONV_NO_FLAGS);
838  if (cd != (iconv_t) (-1))
839  {
840  iconv_close(cd);
841  return true;
842  }
843 
844  return false;
845 }
#define mutt_ch_is_utf8(str)
Definition: charset.h:95
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_choose()

char* mutt_ch_choose ( const char *  fromcode,
const struct Slist *  charsets,
const char *  u,
size_t  ulen,
char **  d,
size_t *  dlen 
)

Figure the best charset to encode a string.

Parameters
[in] fromcode — Original charset of the string
[in] charsets — List of potential charsets to use
[in] u — String to encode
[in] ulen — Length of the string to encode
[out] d — If not NULL, point it to the converted string
[out] dlen — If not NULL, point it to the length of the d string
Return values
ptr — Best performing charset
NULL — None could be found

Definition at line 1035 of file charset.c.

1037 {
1038  if (!fromcode || !charsets)
1039  return NULL;
1040 
1041  char *e = NULL, *tocode = NULL;
1042  size_t elen = 0, bestn = 0;
1043 
1044  const struct ListNode *np = NULL;
1045  STAILQ_FOREACH(np, &charsets->head, entries)
1046  {
1047  char *t = mutt_str_dup(np->data);
1048  if (!t)
1049  continue;
1050 
1051  size_t n = mutt_str_len(t);
1052  char *s = mutt_strn_dup(u, ulen);
1053  const int rc = d ? mutt_ch_convert_string(&s, fromcode, t, MUTT_ICONV_NO_FLAGS) :
1054  mutt_ch_check(s, ulen, fromcode, t);
1055  if (rc)
1056  {
1057  FREE(&t);
1058  FREE(&s);
1059  continue;
1060  }
1061  size_t slen = mutt_str_len(s);
1062 
1063  if (!tocode || (n < bestn))
1064  {
1065  bestn = n;
1066  FREE(&tocode);
1067  tocode = t;
1068  if (d)
1069  {
1070  FREE(&e);
1071  e = s;
1072  }
1073  else
1074  FREE(&s);
1075  elen = slen;
1076  }
1077  else
1078  {
1079  FREE(&t);
1080  FREE(&s);
1081  }
1082  }
1083  if (tocode)
1084  {
1085  if (d)
1086  *d = e;
1087  if (dlen)
1088  *dlen = elen;
1089 
1090  char canonical_buf[1024];
1091  mutt_ch_canonical_charset(canonical_buf, sizeof(canonical_buf), tocode);
1092  mutt_str_replace(&tocode, canonical_buf);
1093  }
1094  return tocode;
1095 }
void mutt_ch_canonical_charset(char *buf, size_t buflen, const char *name)
Canonicalise the charset of a string.
Definition: charset.c:351
int mutt_ch_convert_string(char **ps, const char *from, const char *to, uint8_t flags)
Convert a string between encodings.
Definition: charset.c:752
int mutt_ch_check(const char *s, size_t slen, const char *from, const char *to)
Check whether a string can be converted between encodings.
Definition: charset.c:716
char * mutt_strn_dup(const char *begin, size_t len)
Duplicate a sub-string.
Definition: string.c:428
char * mutt_str_dup(const char *str)
Copy a string, safely.
Definition: string.c:250
size_t mutt_str_len(const char *a)
Calculate the length of a string, safely.
Definition: string.c:544
char * mutt_str_replace(char **p, const char *s)
Replace one string with another.
Definition: string.c:326
#define STAILQ_FOREACH(var, head, field)
Definition: queue.h:352
A List node for strings.
Definition: list.h:35
char * data
String.
Definition: list.h:36
struct ListHead head
List containing values.
Definition: slist.h:48
+ Here is the caller graph for this function:

◆ mutt_ch_chscmp()

bool mutt_ch_chscmp ( const char *  cs1,
const char *  cs2 
)

Are the names of two character sets equivalent?

Parameters
cs1 — First character set
cs2 — Second character set
Return values
true — Names are equivalent
false — Names differ

Charsets may have extensions that mutt_ch_canonical_charset() leaves intact. We expect 'cs2' to originate from neomutt code, not user input (i.e. 'cs2' does not have any extension), so we simply check if the shorter string is a prefix of the longer.

Definition at line 417 of file charset.c.

418 {
419  if (!cs1 || !cs2)
420  return false;
421 
422  char buf[256];
423 
424  mutt_ch_canonical_charset(buf, sizeof(buf), cs1);
425 
426  int len1 = mutt_str_len(buf);
427  int len2 = mutt_str_len(cs2);
428 
429  return mutt_istrn_equal(((len1 > len2) ? buf : cs2),
430  ((len1 > len2) ? cs2 : buf), MIN(len1, len2));
431 }
#define MIN(a, b)
Definition: memory.h:31
bool mutt_istrn_equal(const char *a, const char *b, size_t num)
Check for equality of two strings ignoring case (to a maximum), safely.
Definition: string.c:501
+ Here is the call graph for this function:

◆ mutt_ch_convert_nonmime_string()

int mutt_ch_convert_nonmime_string ( char **  ps)

Try to convert a string using a list of character sets.

Parameters
[in,out] ps — String to be converted
Return values
0 — Success
-1 — Error

Work through $assumed_charset looking for a character set conversion that works. Failing that, try mutt_ch_get_default_charset().

Definition at line 307 of file charset.c.

308 {
309  if (!ps)
310  return -1;
311 
312  char *u = *ps;
313  const size_t ulen = mutt_str_len(u);
314  if (ulen == 0)
315  return 0;
316 
317  const struct Slist *const c_assumed_charset = cs_subset_slist(NeoMutt->sub, "assumed_charset");
318  const char *const c_charset = cs_subset_string(NeoMutt->sub, "charset");
319  const struct ListNode *np = NULL;
320  STAILQ_FOREACH(np, &c_assumed_charset->head, entries)
321  {
322  char const *c = np->data;
323  size_t n = mutt_str_len(c);
324  char *fromcode = mutt_mem_malloc(n + 1);
325  mutt_str_copy(fromcode, c, n + 1);
326  char *s = mutt_strn_dup(u, ulen);
327  int m = mutt_ch_convert_string(&s, fromcode, c_charset, MUTT_ICONV_NO_FLAGS);
328  FREE(&fromcode);
329  if (m == 0)
330  {
331  FREE(ps);
332  *ps = s;
333  return 0;
334  }
335  FREE(&s);
336  }
338  c_charset, MUTT_ICONV_HOOK_FROM);
339  return -1;
340 }
const struct Slist * cs_subset_slist(const struct ConfigSubset *sub, const char *name)
Get a string-list config item by name.
Definition: helpers.c:268
const char * cs_subset_string(const struct ConfigSubset *sub, const char *name)
Get a string config item by name.
Definition: helpers.c:317
char * mutt_ch_get_default_charset(void)
Get the default character set.
Definition: charset.c:439
#define MUTT_ICONV_HOOK_FROM
apply charset-hooks to fromcode
Definition: charset.h:72
Container for Accounts, Notifications.
Definition: neomutt.h:37
struct ConfigSubset * sub
Inherited config items.
Definition: neomutt.h:39
String list.
Definition: slist.h:47
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_convert_string()

int mutt_ch_convert_string ( char **  ps,
const char *  from,
const char *  to,
uint8_t  flags 
)

Convert a string between encodings.

Parameters
[in,out] ps — String to convert
[in] from — Current character set
[in] to — Target character set
[in] flags — Flags, e.g. MUTT_ICONV_HOOK_FROM
Return values
0 — Success
-1 — Invalid arguments or failure to open an iconv channel
errno — Failure in iconv conversion

Parameter flags is given as-is to mutt_ch_iconv_open(). See there for its meaning and usage policy.

Definition at line 752 of file charset.c.

753 {
754  if (!ps)
755  return -1;
756 
757  char *s = *ps;
758 
759  if (!s || (*s == '\0'))
760  return 0;
761 
762  if (!to || !from)
763  return -1;
764 
765  const char *repls[] = { "\357\277\275", "?", 0 };
766  int rc = 0;
767 
768  iconv_t cd = mutt_ch_iconv_open(to, from, flags);
769  if (cd == (iconv_t) -1)
770  return -1;
771 
772  const char **inrepls = NULL;
773  const char *outrepl = NULL;
774 
775  if (mutt_ch_is_utf8(to))
776  outrepl = "\357\277\275";
777  else if (mutt_ch_is_utf8(from))
778  inrepls = repls;
779  else
780  outrepl = "?";
781 
782  const char *ib = s;
783  size_t ibl = strlen(s);
784  if (ibl >= (SIZE_MAX / MB_LEN_MAX))
785  {
786  iconv_close(cd);
787  return -1;
788  }
789  size_t obl = MB_LEN_MAX * ibl;
790  char *buf = mutt_mem_malloc(obl + 1);
791  char *ob = buf;
792 
793  mutt_ch_iconv(cd, &ib, &ibl, &ob, &obl, inrepls, outrepl, &rc);
794  iconv(cd, 0, 0, &ob, &obl);
795  iconv_close(cd);
796 
797  *ob = '\0';
798 
799  FREE(ps);
800  *ps = buf;
801 
802  mutt_str_adjust(ps);
803  return rc;
804 }
size_t mutt_ch_iconv(iconv_t cd, const char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft, const char **inrepls, const char *outrepl, int *iconverrno)
Change the encoding of a string.
Definition: charset.c:617
void mutt_str_adjust(char **ptr)
Shrink-to-fit a string.
Definition: string.c:370
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_fgetconv()

int mutt_ch_fgetconv ( struct FgetConv *  fc)

Convert a file's character set.

Parameters
fc — FgetConv handle
Return values
num — Next character in the converted file
EOF — Error

A file is read into a buffer and its character set is converted. Each call to this function will return one converted character. The buffer is refilled automatically when empty.

Definition at line 907 of file charset.c.

908 {
909  if (!fc)
910  return EOF;
911  if (fc->cd == (iconv_t) -1)
912  return fgetc(fc->fp);
913  if (!fc->p)
914  return EOF;
915  if (fc->p < fc->ob)
916  return (unsigned char) *(fc->p)++;
917 
918  /* Try to convert some more */
919  fc->p = fc->bufo;
920  fc->ob = fc->bufo;
921  if (fc->ibl)
922  {
923  size_t obl = sizeof(fc->bufo);
924  iconv(fc->cd, (ICONV_CONST char **) &fc->ib, &fc->ibl, &fc->ob, &obl);
925  if (fc->p < fc->ob)
926  return (unsigned char) *(fc->p)++;
927  }
928 
929  /* If we trusted iconv a bit more, we would at this point
930  * ask why it had stopped converting ... */
931 
932  /* Try to read some more */
933  if ((fc->ibl == sizeof(fc->bufi)) ||
934  (fc->ibl && (fc->ib + fc->ibl < fc->bufi + sizeof(fc->bufi))))
935  {
936  fc->p = 0;
937  return EOF;
938  }
939  if (fc->ibl)
940  memcpy(fc->bufi, fc->ib, fc->ibl);
941  fc->ib = fc->bufi;
942  fc->ibl += fread(fc->ib + fc->ibl, 1, sizeof(fc->bufi) - fc->ibl, fc->fp);
943 
944  /* Try harder this time to convert some */
945  if (fc->ibl)
946  {
947  size_t obl = sizeof(fc->bufo);
948  mutt_ch_iconv(fc->cd, (const char **) &fc->ib, &fc->ibl, &fc->ob, &obl,
949  fc->inrepls, 0, NULL);
950  if (fc->p < fc->ob)
951  return (unsigned char) *(fc->p)++;
952  }
953 
954  /* Either the file has finished or one of the buffers is too small */
955  fc->p = 0;
956  return EOF;
957 }
char bufi[512]
Definition: charset.h:44
iconv_t cd
Definition: charset.h:43
char bufo[512]
Definition: charset.h:45
size_t ibl
Definition: charset.h:49
FILE * fp
Definition: charset.h:42
char * p
Definition: charset.h:46
const char ** inrepls
Definition: charset.h:50
char * ib
Definition: charset.h:48
char * ob
Definition: charset.h:47
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_fgetconv_close()

void mutt_ch_fgetconv_close ( struct FgetConv **  fc)

Close an fgetconv handle.

Parameters
[out] fc — fgetconv handle

Definition at line 887 of file charset.c.

888 {
889  if (!fc || !*fc)
890  return;
891 
892  if ((*fc)->cd != (iconv_t) -1)
893  iconv_close((*fc)->cd);
894  FREE(fc);
895 }
+ Here is the caller graph for this function:

◆ mutt_ch_fgetconv_open()

struct FgetConv* mutt_ch_fgetconv_open ( FILE *  fp,
const char *  from,
const char *  to,
uint8_t  flags 
)

Prepare a file for charset conversion.

Parameters
fp — FILE ptr to prepare
from — Current character set
to — Destination character set
flags — Flags, e.g. MUTT_ICONV_HOOK_FROM
Return values
ptr — fgetconv handle

Parameter flags is given as-is to mutt_ch_iconv_open().

Definition at line 857 of file charset.c.

858 {
859  struct FgetConv *fc = NULL;
860  iconv_t cd = (iconv_t) -1;
861 
862  if (from && to)
863  cd = mutt_ch_iconv_open(to, from, flags);
864 
865  if (cd != (iconv_t) -1)
866  {
867  static const char *repls[] = { "\357\277\275", "?", 0 };
868 
869  fc = mutt_mem_malloc(sizeof(struct FgetConv));
870  fc->p = fc->bufo;
871  fc->ob = fc->bufo;
872  fc->ib = fc->bufi;
873  fc->ibl = 0;
874  fc->inrepls = mutt_ch_is_utf8(to) ? repls : repls + 1;
875  }
876  else
877  fc = mutt_mem_malloc(sizeof(struct FgetConvNot));
878  fc->fp = fp;
879  fc->cd = cd;
880  return fc;
881 }
A dummy converter.
Definition: charset.h:57
Cursor for converting a file's encoding.
Definition: charset.h:41
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_fgetconvs()

char* mutt_ch_fgetconvs ( char *  buf,
size_t  buflen,
struct FgetConv *  fc
)

Convert a file's charset into a string buffer.

Parameters
buf — Buffer for result
buflen — Length of buffer
fc — FgetConv handle
Return values
ptr — Success, result buffer
NULL — Error

Read a file into a buffer, converting the character set as it goes.

Definition at line 969 of file charset.c.

970 {
971  if (!buf)
972  return NULL;
973 
974  size_t r;
975  for (r = 0; (r + 1) < buflen;)
976  {
977  const int c = mutt_ch_fgetconv(fc);
978  if (c == EOF)
979  break;
980  buf[r++] = (char) c;
981  if (c == '\n')
982  break;
983  }
984  buf[r] = '\0';
985 
986  if (r > 0)
987  return buf;
988 
989  return NULL;
990 }
int mutt_ch_fgetconv(struct FgetConv *fc)
Convert a file's character set.
Definition: charset.c:907
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_get_default_charset()

char* mutt_ch_get_default_charset ( void  )

Get the default character set.

Return values
ptr — Name of the default character set
Warning
This returns a pointer to a static buffer. Do not free it.

Definition at line 439 of file charset.c.

440 {
441  static char fcharset[128];
442  const char *c = NULL;
443  const struct Slist *const c_assumed_charset = cs_subset_slist(NeoMutt->sub, "assumed_charset");
444 
445  if (c_assumed_charset && (c_assumed_charset->count > 0))
446  c = STAILQ_FIRST(&c_assumed_charset->head)->data;
447  else
448  c = "us-ascii";
449 
450  mutt_str_copy(fcharset, c, sizeof(fcharset));
451  return fcharset;
452 }
#define STAILQ_FIRST(head)
Definition: queue.h:350
size_t count
Number of values in list.
Definition: slist.h:49
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_get_langinfo_charset()

char* mutt_ch_get_langinfo_charset ( void  )

Get the user's choice of character set.

Return values
ptr — Charset string

Get the canonical character set used by the user's locale. The caller must free the returned string.

Definition at line 461 of file charset.c.

462 {
463  char buf[1024] = { 0 };
464 
465  mutt_ch_canonical_charset(buf, sizeof(buf), nl_langinfo(CODESET));
466 
467  if (buf[0] != '\0')
468  return mutt_str_dup(buf);
469 
470  return mutt_str_dup("iso-8859-1");
471 }
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_iconv()

size_t mutt_ch_iconv ( iconv_t  cd,
const char **  inbuf,
size_t *  inbytesleft,
char **  outbuf,
size_t *  outbytesleft,
const char **  inrepls,
const char *  outrepl,
int *  iconverrno 
)

Change the encoding of a string.

Parameters
[in] cd — Iconv conversion descriptor
[in,out] inbuf — Buffer to convert
[in,out] inbytesleft — Length of buffer to convert
[in,out] outbuf — Buffer for the result
[in,out] outbytesleft — Length of result buffer
[in] inrepls — Input replacement characters
[in] outrepl — Output replacement characters
[out] iconverrno — Errno if iconv() fails, 0 if it succeeds
Return values
num — Characters converted

Like iconv, but keeps going even when the input is invalid. If you're supplying inrepls, the source charset should be stateless; if you're supplying an outrepl, the target charset should be.

Definition at line 617 of file charset.c.

620 {
621  size_t rc = 0;
622  const char *ib = *inbuf;
623  size_t ibl = *inbytesleft;
624  char *ob = *outbuf;
625  size_t obl = *outbytesleft;
626 
627  while (true)
628  {
629  errno = 0;
630  const size_t ret1 = iconv(cd, (ICONV_CONST char **) &ib, &ibl, &ob, &obl);
631  if (ret1 != (size_t) -1)
632  rc += ret1;
633  if (iconverrno)
634  *iconverrno = errno;
635 
636  if (ibl && obl && (errno == EILSEQ))
637  {
638  if (inrepls)
639  {
640  /* Try replacing the input */
641  const char **t = NULL;
642  for (t = inrepls; *t; t++)
643  {
644  const char *ib1 = *t;
645  size_t ibl1 = strlen(*t);
646  char *ob1 = ob;
647  size_t obl1 = obl;
648  iconv(cd, (ICONV_CONST char **) &ib1, &ibl1, &ob1, &obl1);
649  if (ibl1 == 0)
650  {
651  ib++;
652  ibl--;
653  ob = ob1;
654  obl = obl1;
655  rc++;
656  break;
657  }
658  }
659  if (*t)
660  continue;
661  }
662  /* Replace the output */
663  if (!outrepl)
664  outrepl = "?";
665  iconv(cd, NULL, NULL, &ob, &obl);
666  if (obl)
667  {
668  int n = strlen(outrepl);
669  if (n > obl)
670  {
671  outrepl = "?";
672  n = 1;
673  }
674  memcpy(ob, outrepl, n);
675  ib++;
676  ibl--;
677  ob += n;
678  obl -= n;
679  rc++;
680  iconv(cd, NULL, NULL, NULL, NULL); /* for good measure */
681  continue;
682  }
683  }
684  *inbuf = ib;
685  *inbytesleft = ibl;
686  *outbuf = ob;
687  *outbytesleft = obl;
688  return rc;
689  }
690 }
#define EILSEQ
Definition: charset.c:51
+ Here is the caller graph for this function:

◆ mutt_ch_iconv_lookup()

const char* mutt_ch_iconv_lookup ( const char *  chs)

Look for a replacement character set.

Parameters
chs — Character set to lookup
Return values
ptr — Replacement character set (if a 'iconv-hook' matches)
NULL — No matching hook

Look through all the 'iconv-hook's. If one matches return the replacement character set.

Definition at line 701 of file charset.c.

702 {
704 }
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_iconv_open()

iconv_t mutt_ch_iconv_open ( const char *  tocode,
const char *  fromcode,
uint8_t  flags 
)

Set up iconv for conversions.

Parameters
tocode — Target character set
fromcode — Current character set
flags — Flags, e.g. MUTT_ICONV_HOOK_FROM
Return values
ptr — iconv handle for the conversion

Like iconv_open, but canonicalises the charsets, applies charset-hooks, recanonicalises, and finally applies iconv-hooks. Parameter flags=0 skips charset-hooks, while MUTT_ICONV_HOOK_FROM applies them to fromcode. Callers should use flags=0 when fromcode can safely be considered true, either some constant, or some value provided by the user; MUTT_ICONV_HOOK_FROM should be used only when fromcode is unsure, taken from a possibly wrong incoming MIME label, or such. Misusing MUTT_ICONV_HOOK_FROM leads to unwanted interactions in some setups.

Note
By design charset-hooks should never be, and are never, applied to tocode.
Despite its name, MUTT_ICONV_HOOK_FROM acts on charset-hooks, not on iconv-hooks.

Definition at line 564 of file charset.c.

565 {
566  char tocode1[128];
567  char fromcode1[128];
568  const char *tocode2 = NULL, *fromcode2 = NULL;
569  const char *tmp = NULL;
570 
571  iconv_t cd;
572 
573  /* transform to MIME preferred charset names */
574  mutt_ch_canonical_charset(tocode1, sizeof(tocode1), tocode);
575  mutt_ch_canonical_charset(fromcode1, sizeof(fromcode1), fromcode);
576 
577  /* maybe apply charset-hooks and recanonicalise fromcode,
578  * but only when caller asked us to sanitize a potentially wrong
579  * charset name incoming from the wild exterior. */
580  if (flags & MUTT_ICONV_HOOK_FROM)
581  {
582  tmp = mutt_ch_charset_lookup(fromcode1);
583  if (tmp)
584  mutt_ch_canonical_charset(fromcode1, sizeof(fromcode1), tmp);
585  }
586 
587  /* always apply iconv-hooks to suit system's iconv tastes */
588  tocode2 = mutt_ch_iconv_lookup(tocode1);
589  tocode2 = tocode2 ? tocode2 : tocode1;
590  fromcode2 = mutt_ch_iconv_lookup(fromcode1);
591  fromcode2 = fromcode2 ? fromcode2 : fromcode1;
592 
593  /* call system iconv with names it appreciates */
594  cd = iconv_open(tocode2, fromcode2);
595  if (cd != (iconv_t) -1)
596  return cd;
597 
598  return (iconv_t) -1;
599 }
const char * mutt_ch_charset_lookup(const char *chs)
Look for a replacement character set.
Definition: charset.c:537
const char * mutt_ch_iconv_lookup(const char *chs)
Look for a replacement character set.
Definition: charset.c:701
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_lookup_add()

bool mutt_ch_lookup_add ( enum LookupType  type,
const char *  pat,
const char *  replace,
struct Buffer *  err
)

Add a new character set lookup.

Parameters
type — Type of character set, e.g. MUTT_LOOKUP_CHARSET
pat — Pattern to match
replace — Replacement string
err — Buffer for error message
Return values
true — Lookup added to list
false — Regex string was invalid

Add a regex for a character set and a replacement name.

Definition at line 484 of file charset.c.

486 {
487  if (!pat || !replace)
488  return false;
489 
490  regex_t *rx = mutt_mem_calloc(1, sizeof(regex_t));
491  int rc = REG_COMP(rx, pat, REG_ICASE);
492  if (rc != 0)
493  {
494  regerror(rc, rx, err->data, err->dsize);
495  FREE(&rx);
496  return false;
497  }
498 
499  struct Lookup *l = lookup_new();
500  l->type = type;
501  l->replacement = mutt_str_dup(replace);
502  l->regex.pattern = mutt_str_dup(pat);
503  l->regex.regex = rx;
504  l->regex.pat_not = false;
505 
506  TAILQ_INSERT_TAIL(&Lookups, l, entries);
507 
508  return true;
509 }
void * mutt_mem_calloc(size_t nmemb, size_t size)
Allocate zeroed memory on the heap.
Definition: memory.c:50
static struct LookupList Lookups
Definition: charset.c:78
static struct Lookup * lookup_new(void)
Create a new Lookup.
Definition: charset.c:247
#define TAILQ_INSERT_TAIL(head, elm, field)
Definition: queue.h:809
#define REG_COMP(preg, regex, cflags)
Compile a regular expression.
Definition: regex3.h:53
size_t dsize
Length of data.
Definition: buffer.h:37
char * data
Pointer to data.
Definition: buffer.h:35
Regex to String lookup table.
Definition: charset.c:70
char * replacement
Alternative charset to use.
Definition: charset.c:73
enum LookupType type
Lookup type.
Definition: charset.c:71
struct Regex regex
Regular expression.
Definition: charset.c:72
char * pattern
printable version
Definition: regex3.h:90
bool pat_not
do not match
Definition: regex3.h:92
regex_t * regex
compiled expression
Definition: regex3.h:91
+ Here is the call graph for this function:

◆ mutt_ch_lookup_remove()

void mutt_ch_lookup_remove ( void  )

Remove all the character set lookups.

Empty the list of replacement character set names.

Definition at line 516 of file charset.c.

517 {
518  struct Lookup *l = NULL;
519  struct Lookup *tmp = NULL;
520 
521  TAILQ_FOREACH_SAFE(l, &Lookups, entries, tmp)
522  {
523  TAILQ_REMOVE(&Lookups, l, entries);
524  lookup_free(&l);
525  }
526 }
static void lookup_free(struct Lookup **ptr)
Free a Lookup.
Definition: charset.c:256
#define TAILQ_FOREACH_SAFE(var, head, field, tvar)
Definition: queue.h:735
#define TAILQ_REMOVE(head, elm, field)
Definition: queue.h:841
+ Here is the call graph for this function:

◆ mutt_ch_set_charset()

void mutt_ch_set_charset ( const char *  charset)

Update the records for a new character set.

Parameters
charset — New character set

Check if this character set is utf-8 and pick a suitable replacement character for unprintable characters.

Note
This calls bind_textdomain_codeset() which will affect future message translations.

Definition at line 1002 of file charset.c.

1003 {
1004  char buf[256];
1005 
1006  mutt_ch_canonical_charset(buf, sizeof(buf), charset);
1007 
1008  if (mutt_ch_is_utf8(buf))
1009  {
1010  CharsetIsUtf8 = true;
1011  ReplacementChar = 0xfffd; /* replacement character */
1012  }
1013  else
1014  {
1015  CharsetIsUtf8 = false;
1016  ReplacementChar = '?';
1017  }
1018 
1019 #if defined(HAVE_BIND_TEXTDOMAIN_CODESET) && defined(ENABLE_NLS)
1020  bind_textdomain_codeset(PACKAGE, buf);
1021 #endif
1022 }
bool CharsetIsUtf8
Is the user's current character set utf-8?
Definition: charset.c:62
wchar_t ReplacementChar
When a Unicode character can't be displayed, use this instead.
Definition: charset.c:57
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

Variable Documentation

◆ CharsetIsUtf8

bool CharsetIsUtf8
extern

Is the user's current character set utf-8?

Definition at line 62 of file charset.c.

◆ ReplacementChar

wchar_t ReplacementChar
extern

When a Unicode character can't be displayed, use this instead.

Definition at line 57 of file charset.c.