NeoMutt  2025-01-09-41-g086358
Teaching an old dog new tricks
DOXYGEN
Loading...
Searching...
No Matches
utf7.c
Go to the documentation of this file.
1
51#include "config.h"
52#include <stdbool.h>
53#include <string.h>
54#include "private.h"
55#include "mutt/lib.h"
56#include "core/lib.h"
57
/**
 * Index64u - Lookup table for decoding modified Base64
 *
 * Indexed by a 7-bit ASCII code.  Each entry is the 6-bit value of that
 * character in the modified Base64 alphabet (A-Z, a-z, 0-9, '+', ','),
 * or -1 if the character is not part of the alphabet.
 * Note that ',' maps to 63 and '/' is not a member.
 */
static const int Index64u[128] = {
  // clang-format off
  /* 0x00 */ -1, -1, -1, -1, -1, -1, -1, -1,
  /* 0x08 */ -1, -1, -1, -1, -1, -1, -1, -1,
  /* 0x10 */ -1, -1, -1, -1, -1, -1, -1, -1,
  /* 0x18 */ -1, -1, -1, -1, -1, -1, -1, -1,
  /* 0x20 */ -1, -1, -1, -1, -1, -1, -1, -1,
  /* 0x28 */ -1, -1, -1, 62, 63, -1, -1, -1,
  /* 0x30 */ 52, 53, 54, 55, 56, 57, 58, 59,
  /* 0x38 */ 60, 61, -1, -1, -1, -1, -1, -1,
  /* 0x40 */ -1,  0,  1,  2,  3,  4,  5,  6,
  /* 0x48 */  7,  8,  9, 10, 11, 12, 13, 14,
  /* 0x50 */ 15, 16, 17, 18, 19, 20, 21, 22,
  /* 0x58 */ 23, 24, 25, -1, -1, -1, -1, -1,
  /* 0x60 */ -1, 26, 27, 28, 29, 30, 31, 32,
  /* 0x68 */ 33, 34, 35, 36, 37, 38, 39, 40,
  /* 0x70 */ 41, 42, 43, 44, 45, 46, 47, 48,
  /* 0x78 */ 49, 50, 51, -1, -1, -1, -1, -1
  // clang-format on
};
78
/**
 * B64Chars - Characters of the modified Base64 encoding
 *
 * Maps a 6-bit value (0-63) to its output character.
 * The last character is ',' rather than the '/' of standard Base64.
 */
static const char B64Chars[64] = {
  // clang-format off
  /*  0 */ 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
  /*  8 */ 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
  /* 16 */ 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
  /* 24 */ 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
  /* 32 */ 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
  /* 40 */ 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
  /* 48 */ 'w', 'x', 'y', 'z', '0', '1', '2', '3',
  /* 56 */ '4', '5', '6', '7', '8', '9', '+', ',',
  // clang-format on
};
89
106static char *utf7_to_utf8(const char *u7, size_t u7len, char **u8, size_t *u8len)
107{
108 int b, ch, k;
109
110 char *buf = MUTT_MEM_MALLOC(u7len + u7len / 8 + 1, char);
111 char *p = buf;
112 int pair1 = 0;
113
114 for (; u7len; u7++, u7len--)
115 {
116 if (*u7 == '&')
117 {
118 u7++;
119 u7len--;
120
121 if (u7len && (*u7 == '-'))
122 {
123 *p++ = '&';
124 continue;
125 }
126
127 ch = 0;
128 k = 10;
129 for (; u7len; u7++, u7len--)
130 {
131 if ((*u7 & 0x80) || ((b = Index64u[(int) *u7]) == -1))
132 break;
133 if (k > 0)
134 {
135 ch |= b << k;
136 k -= 6;
137 }
138 else
139 {
140 ch |= b >> (-k);
141 if (ch < 0x80)
142 {
143 if ((0x20 <= ch) && (ch < 0x7f))
144 {
145 /* Printable US-ASCII */
146 goto bail;
147 }
148 *p++ = ch;
149 }
150 else if (ch < 0x800)
151 {
152 *p++ = 0xc0 | (ch >> 6);
153 *p++ = 0x80 | (ch & 0x3f);
154 }
155 else
156 {
157 /* High surrogate pair */
158 if ((ch & ~0x3ff) == 0xd800)
159 {
160 if (pair1)
161 goto bail;
162 pair1 = ch;
163 }
164 else
165 {
166 /* Low surrogate pair */
167 if ((ch & ~0x3ff) == 0xdc00)
168 {
169 if (!pair1)
170 goto bail;
171
172 ch = ((pair1 - 0xd800) << 10) + (ch - 0xdc00) + 0x10000;
173 pair1 = 0;
174 }
175 if (pair1)
176 goto bail;
177
178 if (ch < 0x10000)
179 {
180 *p++ = 0xe0 | (ch >> 12);
181 *p++ = 0x80 | ((ch >> 6) & 0x3f);
182 *p++ = 0x80 | (ch & 0x3f);
183 }
184 else
185 {
186 *p++ = 0xf0 | (ch >> 18);
187 *p++ = 0x80 | ((ch >> 12) & 0x3f);
188 *p++ = 0x80 | ((ch >> 6) & 0x3f);
189 *p++ = 0x80 | (ch & 0x3f);
190 }
191 }
192 }
193
194 ch = (b << (16 + k)) & 0xffff;
195 k += 10;
196 }
197 }
198 if (ch || (k < 6))
199 {
200 /* Non-zero or too many extra bits */
201 goto bail;
202 }
203 if (!u7len || (*u7 != '-'))
204 {
205 /* BASE64 not properly terminated */
206 goto bail;
207 }
208 if ((u7len > 2) && (u7[1] == '&') && (u7[2] != '-'))
209 {
210 /* Adjacent BASE64 sections */
211 goto bail;
212 }
213 }
214 else if ((*u7 < 0x20) || (*u7 >= 0x7f))
215 {
216 /* Not printable US-ASCII */
217 goto bail;
218 }
219 else
220 {
221 *p++ = *u7;
222 }
223 }
224 *p++ = '\0';
225 if (u8len)
226 *u8len = p - buf;
227
228 MUTT_MEM_REALLOC(&buf, p - buf, char);
229 if (u8)
230 *u8 = buf;
231 return buf;
232
233bail:
234 FREE(&buf);
235 return NULL;
236}
237
252static char *utf8_to_utf7(const char *u8, size_t u8len, char **u7, size_t *u7len)
253{
254 int ch;
255 int n, b = 0, k = 0;
256 bool base64 = false;
257
258 /* In the worst case we convert 2 chars to 7 chars. For example:
259 * "\x10&\x10&..." -> "&ABA-&-&ABA-&-...". */
260 char *buf = MUTT_MEM_MALLOC((u8len / 2) * 7 + 6, char);
261 char *p = buf;
262
263 while (u8len)
264 {
265 unsigned char c = *u8;
266
267 if (c < 0x80)
268 {
269 ch = c;
270 n = 0;
271 }
272 else if (c < 0xc2)
273 {
274 goto bail;
275 }
276 else if (c < 0xe0)
277 {
278 ch = c & 0x1f;
279 n = 1;
280 }
281 else if (c < 0xf0)
282 {
283 ch = c & 0x0f;
284 n = 2;
285 }
286 else if (c < 0xf8)
287 {
288 ch = c & 0x07;
289 n = 3;
290 }
291 else if (c < 0xfc)
292 {
293 ch = c & 0x03;
294 n = 4;
295 }
296 else if (c < 0xfe)
297 {
298 ch = c & 0x01;
299 n = 5;
300 }
301 else
302 {
303 goto bail;
304 }
305
306 u8++;
307 u8len--;
308 if (n > u8len)
309 goto bail;
310 for (int i = 0; i < n; i++)
311 {
312 if ((u8[i] & 0xc0) != 0x80)
313 goto bail;
314 ch = (ch << 6) | (u8[i] & 0x3f);
315 }
316 if ((n > 1) && !(ch >> (n * 5 + 1)))
317 goto bail;
318 u8 += n;
319 u8len -= n;
320
321 if ((ch < 0x20) || (ch >= 0x7f))
322 {
323 if (!base64)
324 {
325 *p++ = '&';
326 base64 = true;
327 b = 0;
328 k = 10;
329 }
330
331 // For code points >= 0x10000 we need to use a UTF-16 surrogate pair
332 if (ch & ~0xffff)
333 {
334 ch -= 0x10000;
335 int pair1 = 0xd800 + (ch >> 10);
336 int pair2 = 0xdc00 + (ch & 0x3ff);
337
338 /* Output the high surrogate */
339 *p++ = B64Chars[b | pair1 >> k];
340 k -= 6;
341 for (; k >= 0; k -= 6)
342 *p++ = B64Chars[(pair1 >> k) & 0x3f];
343 b = (pair1 << (-k)) & 0x3f;
344 k += 16;
345
346 /* The low surrogate will be output just below */
347 ch = pair2;
348 }
349
350 *p++ = B64Chars[b | ch >> k];
351 k -= 6;
352 for (; k >= 0; k -= 6)
353 *p++ = B64Chars[(ch >> k) & 0x3f];
354 b = (ch << (-k)) & 0x3f;
355 k += 16;
356 }
357 else
358 {
359 if (base64)
360 {
361 if (k > 10)
362 *p++ = B64Chars[b];
363 *p++ = '-';
364 base64 = false;
365 }
366 *p++ = ch;
367 if (ch == '&')
368 *p++ = '-';
369 }
370 }
371
372 if (base64)
373 {
374 if (k > 10)
375 *p++ = B64Chars[b];
376 *p++ = '-';
377 }
378
379 *p++ = '\0';
380 if (u7len)
381 *u7len = p - buf;
382 MUTT_MEM_REALLOC(&buf, p - buf, char);
383 if (u7)
384 *u7 = buf;
385 return buf;
386
387bail:
388 FREE(&buf);
389 return NULL;
390}
391
397void imap_utf_encode(bool unicode, char **s)
398{
399 if (!s || !*s)
400 return;
401
402 const char *c_charset = cc_charset();
403 if (!c_charset)
404 return;
405
406 if (unicode && mutt_ch_is_utf8(c_charset))
407 {
408 return;
409 }
410
411 if (mutt_ch_convert_string(s, c_charset, "utf-8", MUTT_ICONV_NO_FLAGS) != 0)
412 {
413 FREE(s);
414 return;
415 }
416
417 if (!unicode)
418 {
419 char *utf7 = utf8_to_utf7(*s, strlen(*s), NULL, 0);
420 FREE(s);
421 *s = utf7;
422 }
423}
424
430void imap_utf_decode(bool unicode, char **s)
431{
432 if (!s || !*s)
433 return;
434
435 const char *c_charset = cc_charset();
436 if (!c_charset)
437 return;
438
439 if (unicode && mutt_ch_is_utf8(c_charset))
440 {
441 return;
442 }
443
444 if (!unicode)
445 {
446 char *utf8 = utf7_to_utf8(*s, strlen(*s), 0, 0);
447 FREE(s);
448 *s = utf8;
449 }
450
451 if (mutt_ch_convert_string(s, "utf-8", c_charset, MUTT_ICONV_NO_FLAGS) != 0)
452 {
453 FREE(s);
454 }
455}
const char * cc_charset(void)
Get the cached value of $charset.
Definition: config_cache.c:116
Convenience wrapper for the core headers.
#define FREE(x)
Definition: memory.h:55
#define MUTT_MEM_REALLOC(pptr, n, type)
Definition: memory.h:43
#define MUTT_MEM_MALLOC(n, type)
Definition: memory.h:41
int mutt_ch_convert_string(char **ps, const char *from, const char *to, uint8_t flags)
Convert a string between encodings.
Definition: charset.c:831
#define mutt_ch_is_utf8(str)
Definition: charset.h:89
#define MUTT_ICONV_NO_FLAGS
No flags are set.
Definition: charset.h:64
Convenience wrapper for the library headers.
GUI display the mailboxes in a side panel.
void imap_utf_encode(bool unicode, char **s)
Encode email from local charset to UTF-8.
Definition: utf7.c:397
static char * utf8_to_utf7(const char *u8, size_t u8len, char **u7, size_t *u7len)
Convert data from UTF-8 to RFC2060's UTF-7.
Definition: utf7.c:252
void imap_utf_decode(bool unicode, char **s)
Decode email from UTF-8 to local charset.
Definition: utf7.c:430
static const char B64Chars[64]
Characters of the Base64 encoding.
Definition: utf7.c:82
static char * utf7_to_utf8(const char *u7, size_t u7len, char **u8, size_t *u8len)
Convert data from RFC2060's UTF-7 to UTF-8.
Definition: utf7.c:106
static const int Index64u[128]
Lookup table for Base64 encoding/decoding.
Definition: utf7.c:66