NeoMutt  2023-05-17-16-g61469c
Teaching an old dog new tricks
DOXYGEN
lib.h File Reference

Conversion between different character encodings. More...

#include <wchar.h>
#include "config/lib.h"
+ Include dependency graph for lib.h:
+ This graph shows which files directly or indirectly include this file:

Go to the source code of this file.

Functions

size_t mutt_convert_file_from_to (FILE *fp, const struct Slist *fromcodes, const struct Slist *tocodes, char **fromcode, char **tocode, struct Content *info)
 Convert a file between encodings. More...
 
size_t mutt_convert_file_to (FILE *fp, const char *fromcode, struct Slist const *const tocodes, int *tocode, struct Content *info)
 Change the encoding of a file. More...
 
struct Content * mutt_get_content_info (const char *fname, struct Body *b, struct ConfigSubset *sub)
 Analyze file to determine MIME encoding to use. More...
 
void mutt_update_content_info (struct Content *info, struct ContentState *s, char *buf, size_t buflen)
 Cache some info about an email. More...
 

Detailed Description

Conversion between different character encodings.

Authors
  • Michal Siedlaczek

This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with this program. If not, see http://www.gnu.org/licenses/.

Definition in file lib.h.

Function Documentation

◆ mutt_convert_file_from_to()

size_t mutt_convert_file_from_to ( FILE *  fp,
const struct Slist *  fromcodes,
const struct Slist *  tocodes,
char **  fromcode,
char **  tocode,
struct Content *  info 
)

Convert a file between encodings.

Parameters
[in]  fp: File to read from
[in]  fromcodes: Charsets to try converting FROM
[in]  tocodes: Charsets to try converting TO
[out] fromcode: From charset selected
[out] tocode: To charset selected
[out] info: Info about the file
Return values
num: Characters converted inexactly
ICONV_ILLEGAL_SEQ: Error (as a size_t)

Find the first of the fromcodes that gives a valid conversion and the best charset conversion of the file into one of the tocodes. If successful, set *fromcode to the matching entry of fromcodes (a pointer into that list, not a copy), set *tocode to a dynamically allocated string that the caller must free, set Content *info, and return the number of characters converted inexactly. If no conversion was possible, return ICONV_ILLEGAL_SEQ.

Definition at line 212 of file convert.c.

215{
216 char **tcode = NULL;
217 size_t rc;
218 int cn;
219 struct ListNode *np = NULL;
220
221 /* Copy them */
222 tcode = mutt_mem_calloc(tocodes->count, sizeof(char *));
223 np = NULL;
224 cn = 0;
225 STAILQ_FOREACH(np, &tocodes->head, entries)
226 {
227 tcode[cn++] = mutt_str_dup(np->data);
228 }
229
231 np = NULL;
232 cn = 0;
233 STAILQ_FOREACH(np, &fromcodes->head, entries)
234 {
235 /* Try each fromcode in turn */
236 rc = mutt_convert_file_to(fp, np->data, tocodes, &cn, info);
237 if (rc != ICONV_ILLEGAL_SEQ)
238 {
239 *fromcode = np->data;
240 *tocode = tcode[cn];
241 tcode[cn] = 0;
242 break;
243 }
244 }
245
246 /* Free memory */
247 for (cn = 0; cn < tocodes->count; cn++)
248 FREE(&tcode[cn]);
249
250 FREE(&tcode);
251
252 return rc;
253}
size_t mutt_convert_file_to(FILE *fp, const char *fromcode, struct Slist const *const tocodes, int *tocode, struct Content *info)
Change the encoding of a file.
Definition: convert.c:63
void * mutt_mem_calloc(size_t nmemb, size_t size)
Allocate zeroed memory on the heap.
Definition: memory.c:50
#define FREE(x)
Definition: memory.h:43
#define ICONV_ILLEGAL_SEQ
Error value for iconv() - Illegal sequence.
Definition: charset.h:103
char * mutt_str_dup(const char *str)
Copy a string, safely.
Definition: string.c:251
#define STAILQ_FOREACH(var, head, field)
Definition: queue.h:352
A List node for strings.
Definition: list.h:35
char * data
String.
Definition: list.h:36
struct ListHead head
List containing values.
Definition: slist.h:48
size_t count
Number of values in list.
Definition: slist.h:49
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_convert_file_to()

size_t mutt_convert_file_to ( FILE *  fp,
const char *  fromcode,
struct Slist const *const  tocodes,
int *  tocode,
struct Content *  info 
)

Change the encoding of a file.

Parameters
[in]  fp: File to convert
[in]  fromcode: Original encoding
[in]  tocodes: List of target encodings
[out] tocode: Index (within tocodes) of the chosen encoding
[out] info: Encoding information
Return values
-1: Error, no conversion was possible
>=0: Success, number of characters converted inexactly (0 means an exact conversion)

Find the best charset conversion of the file from fromcode into one of the tocodes. If successful, set *tocode and Content *info and return the number of characters converted inexactly.

We convert via UTF-8 in order to avoid the condition -1(EINVAL), which would otherwise prevent us from knowing the number of inexact conversions. Where the candidate target charset is UTF-8 we avoid doing the second conversion because iconv_open("UTF-8", "UTF-8") fails with some libraries.

We assume that the output from iconv is never more than 4 times as long as the input for any pair of charsets we might be interested in.

Definition at line 63 of file convert.c.

65{
66 char bufi[256], bufu[512], bufo[4 * sizeof(bufi)];
67 size_t rc;
68
69 const iconv_t cd1 = mutt_ch_iconv_open("utf-8", fromcode, MUTT_ICONV_NO_FLAGS);
70 if (!iconv_t_valid(cd1))
71 return -1;
72
73 int ncodes = tocodes->count;
74 iconv_t *cd = mutt_mem_calloc(ncodes, sizeof(iconv_t));
75 size_t *score = mutt_mem_calloc(ncodes, sizeof(size_t));
76 struct ContentState *states = mutt_mem_calloc(ncodes, sizeof(struct ContentState));
77 struct Content *infos = mutt_mem_calloc(ncodes, sizeof(struct Content));
78
79 struct ListNode *np = NULL;
80 int ni = 0;
81 STAILQ_FOREACH(np, &tocodes->head, entries)
82 {
83 if (!mutt_istr_equal(np->data, "utf-8"))
84 {
85 cd[ni] = mutt_ch_iconv_open(np->data, "utf-8", MUTT_ICONV_NO_FLAGS);
86 }
87 else
88 {
89 /* Special case for conversion to UTF-8 */
90 cd[ni] = ICONV_T_INVALID;
91 score[ni] = ICONV_ILLEGAL_SEQ;
92 }
93 ni += 1;
94 }
95
96 rewind(fp);
97 size_t ibl = 0;
98 while (true)
99 {
100 /* Try to fill input buffer */
101 size_t n = fread(bufi + ibl, 1, sizeof(bufi) - ibl, fp);
102 ibl += n;
103
104 /* Convert to UTF-8 */
105 const char *ib = bufi;
106 char *ob = bufu;
107 size_t obl = sizeof(bufu);
108 n = iconv(cd1, (ICONV_CONST char **) ((ibl != 0) ? &ib : 0), &ibl, &ob, &obl);
109 if ((n == ICONV_ILLEGAL_SEQ) && (((errno != EINVAL) && (errno != E2BIG)) || (ib == bufi)))
110 {
112 break;
113 }
114 const size_t ubl1 = ob - bufu;
115
116 /* Convert from UTF-8 */
117 for (int i = 0; i < ncodes; i++)
118 {
119 if (iconv_t_valid(cd[i]) && (score[i] != ICONV_ILLEGAL_SEQ))
120 {
121 const char *ub = bufu;
122 size_t ubl = ubl1;
123 ob = bufo;
124 obl = sizeof(bufo);
125 n = iconv(cd[i], (ICONV_CONST char **) ((ibl || ubl) ? &ub : 0), &ubl, &ob, &obl);
126 if (n == ICONV_ILLEGAL_SEQ)
127 {
128 score[i] = ICONV_ILLEGAL_SEQ;
129 }
130 else
131 {
132 score[i] += n;
133 mutt_update_content_info(&infos[i], &states[i], bufo, ob - bufo);
134 }
135 }
136 else if (!iconv_t_valid(cd[i]) && (score[i] == ICONV_ILLEGAL_SEQ))
137 {
138 /* Special case for conversion to UTF-8 */
139 mutt_update_content_info(&infos[i], &states[i], bufu, ubl1);
140 }
141 }
142
143 if (ibl)
144 {
145 /* Save unused input */
146 memmove(bufi, ib, ibl);
147 }
148 else if (!ubl1 && (ib < bufi + sizeof(bufi)))
149 {
150 rc = 0;
151 break;
152 }
153 }
154
155 if (rc == 0)
156 {
157 /* Find best score */
159 for (int i = 0; i < ncodes; i++)
160 {
161 if (!iconv_t_valid(cd[i]) && (score[i] == ICONV_ILLEGAL_SEQ))
162 {
163 /* Special case for conversion to UTF-8 */
164 *tocode = i;
165 rc = 0;
166 break;
167 }
168 else if (!iconv_t_valid(cd[i]) || (score[i] == ICONV_ILLEGAL_SEQ))
169 {
170 continue;
171 }
172 else if ((rc == ICONV_ILLEGAL_SEQ) || (score[i] < rc))
173 {
174 *tocode = i;
175 rc = score[i];
176 if (rc == 0)
177 break;
178 }
179 }
180 if (rc != ICONV_ILLEGAL_SEQ)
181 {
182 memcpy(info, &infos[*tocode], sizeof(struct Content));
183 mutt_update_content_info(info, &states[*tocode], 0, 0); /* EOF */
184 }
185 }
186
187 FREE(&cd);
188 FREE(&infos);
189 FREE(&score);
190 FREE(&states);
191
192 return rc;
193}
void mutt_update_content_info(struct Content *info, struct ContentState *s, char *buf, size_t buflen)
Cache some info about an email.
Definition: content_info.c:47
iconv_t mutt_ch_iconv_open(const char *tocode, const char *fromcode, uint8_t flags)
Set up iconv for conversions.
Definition: charset.c:585
#define ICONV_T_INVALID
Error value for iconv functions.
Definition: charset.h:100
#define MUTT_ICONV_NO_FLAGS
No flags are set.
Definition: charset.h:71
static bool iconv_t_valid(const iconv_t cd)
Is the conversion descriptor valid?
Definition: charset.h:112
bool mutt_istr_equal(const char *a, const char *b)
Compare two strings, ignoring case.
Definition: string.c:810
Info about the body of an email.
Definition: content.h:56
Info about an attachment.
Definition: content.h:35
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_get_content_info()

struct Content * mutt_get_content_info ( const char *  fname,
struct Body *  b,
struct ConfigSubset *  sub 
)

Analyze file to determine MIME encoding to use.

Parameters
fname: File to examine
b: Body to update
sub: Config Subset
Return values
ptr: Newly allocated Content

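For a text body that permits conversion (neither noconv nor force_charset is set), also set the body's "charset" parameter; when a charset conversion succeeds, b->charset is additionally set to the source charset of the file.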

Definition at line 186 of file content_info.c.

188{
189 struct Content *info = NULL;
190 struct ContentState cstate = { 0 };
191 FILE *fp = NULL;
192 char *fromcode = NULL;
193 char *tocode = NULL;
194 char buf[100] = { 0 };
195 size_t r;
196
197 struct stat st = { 0 };
198
199 if (b && !fname)
200 fname = b->filename;
201 if (!fname)
202 return NULL;
203
204 if (stat(fname, &st) == -1)
205 {
206 mutt_error(_("Can't stat %s: %s"), fname, strerror(errno));
207 return NULL;
208 }
209
210 if (!S_ISREG(st.st_mode))
211 {
212 mutt_error(_("%s isn't a regular file"), fname);
213 return NULL;
214 }
215
216 fp = fopen(fname, "r");
217 if (!fp)
218 {
219 mutt_debug(LL_DEBUG1, "%s: %s (errno %d)\n", fname, strerror(errno), errno);
220 return NULL;
221 }
222
223 info = mutt_mem_calloc(1, sizeof(struct Content));
224
225 const char *const c_charset = cc_charset();
226 if (b && (b->type == TYPE_TEXT) && (!b->noconv && !b->force_charset))
227 {
228 const struct Slist *const c_attach_charset = cs_subset_slist(sub, "attach_charset");
229 const struct Slist *const c_send_charset = cs_subset_slist(sub, "send_charset");
230 struct Slist *c_charset_slist = slist_parse(c_charset, SLIST_SEP_COLON);
231
232 const struct Slist *fchs = b->use_disp ?
233 (c_attach_charset ? c_attach_charset : c_charset_slist) :
234 c_charset_slist;
235
236 struct Slist *chs = slist_parse(mutt_param_get(&b->parameter, "charset"), SLIST_SEP_COLON);
237
238 if (c_charset && (chs || c_send_charset) &&
239 (mutt_convert_file_from_to(fp, fchs, chs ? chs : c_send_charset, &fromcode,
240 &tocode, info) != ICONV_ILLEGAL_SEQ))
241 {
242 if (!chs)
243 {
244 char chsbuf[256] = { 0 };
245 mutt_ch_canonical_charset(chsbuf, sizeof(chsbuf), tocode);
246 mutt_param_set(&b->parameter, "charset", chsbuf);
247 }
248 FREE(&b->charset);
249 b->charset = mutt_str_dup(fromcode);
250 FREE(&tocode);
251 mutt_file_fclose(&fp);
252 slist_free(&c_charset_slist);
253 slist_free(&chs);
254 return info;
255 }
256
257 slist_free(&c_charset_slist);
258 slist_free(&chs);
259 }
260
261 rewind(fp);
262 while ((r = fread(buf, 1, sizeof(buf), fp)))
263 mutt_update_content_info(info, &cstate, buf, r);
264 mutt_update_content_info(info, &cstate, 0, 0);
265
266 mutt_file_fclose(&fp);
267
268 if (b && (b->type == TYPE_TEXT) && (!b->noconv && !b->force_charset))
269 {
270 mutt_param_set(&b->parameter, "charset",
271 (!info->hibin ? "us-ascii" :
272 c_charset && !mutt_ch_is_us_ascii(c_charset) ? c_charset :
273 "unknown-8bit"));
274 }
275
276 return info;
277}
const char * cc_charset(void)
Get the cached value of $charset.
Definition: cache.c:106
const struct Slist * cs_subset_slist(const struct ConfigSubset *sub, const char *name)
Get a string-list config item by name.
Definition: helpers.c:268
size_t mutt_convert_file_from_to(FILE *fp, const struct Slist *fromcodes, const struct Slist *tocodes, char **fromcode, char **tocode, struct Content *info)
Convert a file between encodings.
Definition: convert.c:212
int mutt_file_fclose(FILE **fp)
Close a FILE handle (and NULL the pointer)
Definition: file.c:150
#define mutt_error(...)
Definition: logging2.h:87
#define mutt_debug(LEVEL,...)
Definition: logging2.h:84
@ LL_DEBUG1
Log at debug level 1.
Definition: logging2.h:40
@ TYPE_TEXT
Type: 'text/*'.
Definition: mime.h:38
void mutt_ch_canonical_charset(char *buf, size_t buflen, const char *name)
Canonicalise the charset of a string.
Definition: charset.c:367
#define mutt_ch_is_us_ascii(str)
Definition: charset.h:97
#define _(a)
Definition: message.h:28
struct Slist * slist_parse(const char *str, uint32_t flags)
Parse a list of strings into a list.
Definition: slist.c:213
void slist_free(struct Slist **list)
Free an Slist object.
Definition: slist.c:162
char * mutt_param_get(const struct ParameterList *pl, const char *s)
Find a matching Parameter.
Definition: parameter.c:84
void mutt_param_set(struct ParameterList *pl, const char *attribute, const char *value)
Set a Parameter.
Definition: parameter.c:110
#define SLIST_SEP_COLON
Definition: slist.h:35
bool noconv
Don't do character set conversion.
Definition: body.h:46
char * charset
Send mode: charset of attached file as stored on disk.
Definition: body.h:78
struct ParameterList parameter
Parameters of the content-type.
Definition: body.h:62
bool use_disp
Content-Disposition uses filename= ?
Definition: body.h:47
bool force_charset
Send mode: don't adjust the character set when in send-mode.
Definition: body.h:44
unsigned int type
content-type primary type, ContentType
Definition: body.h:40
char * filename
When sending a message, this is the file to which this structure refers.
Definition: body.h:58
long hibin
8-bit characters
Definition: content.h:36
String list.
Definition: slist.h:47
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_update_content_info()

void mutt_update_content_info ( struct Content *  info,
struct ContentState *  s,
char *  buf,
size_t  buflen 
)

Cache some info about an email.

Parameters
info: Info about an Attachment
s: Info about the Body of an email
buf: Buffer of data to analyse (NULL signals end of file)
buflen: Length of the buffer

Definition at line 47 of file content_info.c.

49{
50 bool from = s->from;
51 int whitespace = s->whitespace;
52 bool dot = s->dot;
53 int linelen = s->linelen;
54 bool was_cr = s->was_cr;
55
56 if (!buf) /* This signals EOF */
57 {
58 if (was_cr)
59 info->binary = true;
60 if (linelen > info->linemax)
61 info->linemax = linelen;
62
63 return;
64 }
65
66 for (; buflen; buf++, buflen--)
67 {
68 char ch = *buf;
69
70 if (was_cr)
71 {
72 was_cr = false;
73 if (ch == '\n')
74 {
75 if (whitespace)
76 info->space = true;
77 if (dot)
78 info->dot = true;
79 if (linelen > info->linemax)
80 info->linemax = linelen;
81 whitespace = 0;
82 dot = false;
83 linelen = 0;
84 continue;
85 }
86
87 info->binary = true;
88 }
89
90 linelen++;
91 if (ch == '\n')
92 {
93 info->crlf++;
94 if (whitespace)
95 info->space = true;
96 if (dot)
97 info->dot = true;
98 if (linelen > info->linemax)
99 info->linemax = linelen;
100 whitespace = 0;
101 linelen = 0;
102 dot = false;
103 }
104 else if (ch == '\r')
105 {
106 info->crlf++;
107 info->cr = true;
108 was_cr = true;
109 continue;
110 }
111 else if (ch & 0x80)
112 {
113 info->hibin++;
114 }
115 else if ((ch == '\t') || (ch == '\f'))
116 {
117 info->ascii++;
118 whitespace++;
119 }
120 else if (ch == 0)
121 {
122 info->nulbin++;
123 info->lobin++;
124 }
125 else if ((ch < 32) || (ch == 127))
126 {
127 info->lobin++;
128 }
129 else
130 {
131 if (linelen == 1)
132 {
133 if ((ch == 'F') || (ch == 'f'))
134 from = true;
135 else
136 from = false;
137 if (ch == '.')
138 dot = true;
139 else
140 dot = false;
141 }
142 else if (from)
143 {
144 if ((linelen == 2) && (ch != 'r'))
145 {
146 from = false;
147 }
148 else if ((linelen == 3) && (ch != 'o'))
149 {
150 from = false;
151 }
152 else if (linelen == 4)
153 {
154 if (ch == 'm')
155 info->from = true;
156 from = false;
157 }
158 }
159 if (ch == ' ')
160 whitespace++;
161 info->ascii++;
162 }
163
164 if (linelen > 1)
165 dot = false;
166 if ((ch != ' ') && (ch != '\t'))
167 whitespace = 0;
168 }
169
170 s->from = from;
171 s->whitespace = whitespace;
172 s->dot = dot;
173 s->linelen = linelen;
174 s->was_cr = was_cr;
175}
bool was_cr
Was the last character CR?
Definition: content.h:61
int whitespace
Number of trailing whitespaces.
Definition: content.h:58
bool from
Is the current line a prefix of "From "?
Definition: content.h:57
int linelen
Length of the current line.
Definition: content.h:60
bool dot
Was the last character a dot?
Definition: content.h:59
long crlf
\r and \n characters
Definition: content.h:39
bool cr
Has CR, even when in a CRLF pair.
Definition: content.h:46
bool space
Whitespace at the end of lines?
Definition: content.h:42
long ascii
Number of ascii chars.
Definition: content.h:40
bool binary
Long lines, or CR not in CRLF pair.
Definition: content.h:43
bool from
Has a line beginning with "From "?
Definition: content.h:44
long nulbin
Null characters (0x0)
Definition: content.h:38
long linemax
Length of the longest line in the file.
Definition: content.h:41
long lobin
Unprintable 7-bit chars (eg., control chars)
Definition: content.h:37
bool dot
Has a line consisting of a single dot?
Definition: content.h:45
+ Here is the caller graph for this function: