NeoMutt  2022-04-29-215-gc12b98
Teaching an old dog new tricks
DOXYGEN
utf7.c
Go to the documentation of this file.
1
49#include "config.h"
50#include <stdbool.h>
51#include <string.h>
52#include "private.h"
53#include "mutt/lib.h"
54#include "config/lib.h"
55#include "core/lib.h"
56
/**
 * Index64u - Lookup table for Base64 encoding/decoding
 *
 * Indexed by a 7-bit character code; yields the character's 6-bit value, or
 * -1 if the character is not part of the alphabet.  The alphabet is
 * 'A'-'Z' (0-25), 'a'-'z' (26-51), '0'-'9' (52-61), '+' (62) and ',' (63).
 * Note that '/' maps to -1.
 */
const int Index64u[128] = {
  // clang-format off
  /* 0x00 - 0x07 */ -1, -1, -1, -1, -1, -1, -1, -1,
  /* 0x08 - 0x0f */ -1, -1, -1, -1, -1, -1, -1, -1,
  /* 0x10 - 0x17 */ -1, -1, -1, -1, -1, -1, -1, -1,
  /* 0x18 - 0x1f */ -1, -1, -1, -1, -1, -1, -1, -1,
  /* ' '  - '\'' */ -1, -1, -1, -1, -1, -1, -1, -1,
  /* '('  - '/'  */ -1, -1, -1, 62, 63, -1, -1, -1,
  /* '0'  - '7'  */ 52, 53, 54, 55, 56, 57, 58, 59,
  /* '8'  - '?'  */ 60, 61, -1, -1, -1, -1, -1, -1,
  /* '@'  - 'G'  */ -1,  0,  1,  2,  3,  4,  5,  6,
  /* 'H'  - 'O'  */  7,  8,  9, 10, 11, 12, 13, 14,
  /* 'P'  - 'W'  */ 15, 16, 17, 18, 19, 20, 21, 22,
  /* 'X'  - '_'  */ 23, 24, 25, -1, -1, -1, -1, -1,
  /* '`'  - 'g'  */ -1, 26, 27, 28, 29, 30, 31, 32,
  /* 'h'  - 'o'  */ 33, 34, 35, 36, 37, 38, 39, 40,
  /* 'p'  - 'w'  */ 41, 42, 43, 44, 45, 46, 47, 48,
  /* 'x'  - 0x7f */ 49, 50, 51, -1, -1, -1, -1, -1
  // clang-format on
};
77
/**
 * B64Chars - Characters of the Base64 encoding
 *
 * Maps a 6-bit value (0-63) to its character.  The array holds exactly 64
 * characters and is not NUL-terminated.
 */
static const char B64Chars[64] = {
  // clang-format off
  /*  0 - 12 */ 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
  /* 13 - 25 */ 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
  /* 26 - 38 */ 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
  /* 39 - 51 */ 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
  /* 52 - 61 */ '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
  /* 62 - 63 */ '+', ',',
  // clang-format on
};
88
105static char *utf7_to_utf8(const char *u7, size_t u7len, char **u8, size_t *u8len)
106{
107 int b, ch, k;
108
109 char *buf = mutt_mem_malloc(u7len + u7len / 8 + 1);
110 char *p = buf;
111 int pair1 = 0;
112
113 for (; u7len; u7++, u7len--)
114 {
115 if (*u7 == '&')
116 {
117 u7++;
118 u7len--;
119
120 if (u7len && (*u7 == '-'))
121 {
122 *p++ = '&';
123 continue;
124 }
125
126 ch = 0;
127 k = 10;
128 for (; u7len; u7++, u7len--)
129 {
130 if ((*u7 & 0x80) || ((b = Index64u[(int) *u7]) == -1))
131 break;
132 if (k > 0)
133 {
134 ch |= b << k;
135 k -= 6;
136 }
137 else
138 {
139 ch |= b >> (-k);
140 if (ch < 0x80)
141 {
142 if ((0x20 <= ch) && (ch < 0x7f))
143 {
144 /* Printable US-ASCII */
145 goto bail;
146 }
147 *p++ = ch;
148 }
149 else if (ch < 0x800)
150 {
151 *p++ = 0xc0 | (ch >> 6);
152 *p++ = 0x80 | (ch & 0x3f);
153 }
154 else
155 {
156 /* High surrogate pair */
157 if ((ch & ~0x3ff) == 0xd800)
158 {
159 if (pair1)
160 goto bail;
161 pair1 = ch;
162 }
163 else
164 {
165 /* Low surrogate pair */
166 if ((ch & ~0x3ff) == 0xdc00)
167 {
168 if (!pair1)
169 goto bail;
170
171 ch = ((pair1 - 0xd800) << 10) + (ch - 0xdc00) + 0x10000;
172 pair1 = 0;
173 }
174 if (pair1)
175 goto bail;
176
177 if (ch < 0x10000)
178 {
179 *p++ = 0xe0 | (ch >> 12);
180 *p++ = 0x80 | ((ch >> 6) & 0x3f);
181 *p++ = 0x80 | (ch & 0x3f);
182 }
183 else
184 {
185 *p++ = 0xf0 | (ch >> 18);
186 *p++ = 0x80 | ((ch >> 12) & 0x3f);
187 *p++ = 0x80 | ((ch >> 6) & 0x3f);
188 *p++ = 0x80 | (ch & 0x3f);
189 }
190 }
191 }
192
193 ch = (b << (16 + k)) & 0xffff;
194 k += 10;
195 }
196 }
197 if (ch || (k < 6))
198 {
199 /* Non-zero or too many extra bits */
200 goto bail;
201 }
202 if (!u7len || (*u7 != '-'))
203 {
204 /* BASE64 not properly terminated */
205 goto bail;
206 }
207 if ((u7len > 2) && (u7[1] == '&') && (u7[2] != '-'))
208 {
209 /* Adjacent BASE64 sections */
210 goto bail;
211 }
212 }
213 else if ((*u7 < 0x20) || (*u7 >= 0x7f))
214 {
215 /* Not printable US-ASCII */
216 goto bail;
217 }
218 else
219 *p++ = *u7;
220 }
221 *p++ = '\0';
222 if (u8len)
223 *u8len = p - buf;
224
225 mutt_mem_realloc(&buf, p - buf);
226 if (u8)
227 *u8 = buf;
228 return buf;
229
230bail:
231 FREE(&buf);
232 return NULL;
233}
234
249static char *utf8_to_utf7(const char *u8, size_t u8len, char **u7, size_t *u7len)
250{
251 int ch;
252 int n, b = 0, k = 0;
253 bool base64 = false;
254
255 /* In the worst case we convert 2 chars to 7 chars. For example:
256 * "\x10&\x10&..." -> "&ABA-&-&ABA-&-...". */
257 char *buf = mutt_mem_malloc((u8len / 2) * 7 + 6);
258 char *p = buf;
259
260 while (u8len)
261 {
262 unsigned char c = *u8;
263
264 if (c < 0x80)
265 {
266 ch = c;
267 n = 0;
268 }
269 else if (c < 0xc2)
270 goto bail;
271 else if (c < 0xe0)
272 {
273 ch = c & 0x1f;
274 n = 1;
275 }
276 else if (c < 0xf0)
277 {
278 ch = c & 0x0f;
279 n = 2;
280 }
281 else if (c < 0xf8)
282 {
283 ch = c & 0x07;
284 n = 3;
285 }
286 else if (c < 0xfc)
287 {
288 ch = c & 0x03;
289 n = 4;
290 }
291 else if (c < 0xfe)
292 {
293 ch = c & 0x01;
294 n = 5;
295 }
296 else
297 goto bail;
298
299 u8++;
300 u8len--;
301 if (n > u8len)
302 goto bail;
303 for (int i = 0; i < n; i++)
304 {
305 if ((u8[i] & 0xc0) != 0x80)
306 goto bail;
307 ch = (ch << 6) | (u8[i] & 0x3f);
308 }
309 if ((n > 1) && !(ch >> (n * 5 + 1)))
310 goto bail;
311 u8 += n;
312 u8len -= n;
313
314 if ((ch < 0x20) || (ch >= 0x7f))
315 {
316 if (!base64)
317 {
318 *p++ = '&';
319 base64 = true;
320 b = 0;
321 k = 10;
322 }
323
324 // For code points >= 0x10000 we need to use a UTF-16 surrogate pair
325 if (ch & ~0xffff)
326 {
327 ch -= 0x10000;
328 int pair1 = 0xd800 + (ch >> 10);
329 int pair2 = 0xdc00 + (ch & 0x3ff);
330
331 /* Output the high surrogate */
332 *p++ = B64Chars[b | pair1 >> k];
333 k -= 6;
334 for (; k >= 0; k -= 6)
335 *p++ = B64Chars[(pair1 >> k) & 0x3f];
336 b = (pair1 << (-k)) & 0x3f;
337 k += 16;
338
339 /* The low surrogate will be output just below */
340 ch = pair2;
341 }
342
343 *p++ = B64Chars[b | ch >> k];
344 k -= 6;
345 for (; k >= 0; k -= 6)
346 *p++ = B64Chars[(ch >> k) & 0x3f];
347 b = (ch << (-k)) & 0x3f;
348 k += 16;
349 }
350 else
351 {
352 if (base64)
353 {
354 if (k > 10)
355 *p++ = B64Chars[b];
356 *p++ = '-';
357 base64 = false;
358 }
359 *p++ = ch;
360 if (ch == '&')
361 *p++ = '-';
362 }
363 }
364
365 if (base64)
366 {
367 if (k > 10)
368 *p++ = B64Chars[b];
369 *p++ = '-';
370 }
371
372 *p++ = '\0';
373 if (u7len)
374 *u7len = p - buf;
375 mutt_mem_realloc(&buf, p - buf);
376 if (u7)
377 *u7 = buf;
378 return buf;
379
380bail:
381 FREE(&buf);
382 return NULL;
383}
384
390void imap_utf_encode(bool unicode, char **s)
391{
392 const char *const c_charset = cs_subset_string(NeoMutt->sub, "charset");
393 if (!c_charset || !s || !*s)
394 return;
395
396 if (unicode && mutt_ch_is_utf8(c_charset))
397 {
398 return;
399 }
400
401 if (mutt_ch_convert_string(s, c_charset, "utf-8", MUTT_ICONV_NO_FLAGS) != 0)
402 {
403 FREE(s);
404 return;
405 }
406
407 if (!unicode)
408 {
409 char *utf7 = utf8_to_utf7(*s, strlen(*s), NULL, 0);
410 FREE(s);
411 *s = utf7;
412 }
413}
414
420void imap_utf_decode(bool unicode, char **s)
421{
422 const char *const c_charset = cs_subset_string(NeoMutt->sub, "charset");
423 if (!c_charset || !s || !*s)
424 return;
425
426 if (unicode && mutt_ch_is_utf8(c_charset))
427 {
428 return;
429 }
430
431 if (!unicode)
432 {
433 char *utf8 = utf7_to_utf8(*s, strlen(*s), 0, 0);
434 FREE(s);
435 *s = utf8;
436 }
437
438 if (mutt_ch_convert_string(s, "utf-8", c_charset, MUTT_ICONV_NO_FLAGS) != 0)
439 {
440 FREE(s);
441 }
442}
const char * cs_subset_string(const struct ConfigSubset *sub, const char *name)
Get a string config item by name.
Definition: helpers.c:317
Convenience wrapper for the config headers.
Convenience wrapper for the core headers.
void * mutt_mem_malloc(size_t size)
Allocate memory on the heap.
Definition: memory.c:90
void mutt_mem_realloc(void *ptr, size_t size)
Resize a block of memory on the heap.
Definition: memory.c:114
#define FREE(x)
Definition: memory.h:43
int mutt_ch_convert_string(char **ps, const char *from, const char *to, uint8_t flags)
Convert a string between encodings.
Definition: charset.c:752
#define mutt_ch_is_utf8(str)
Definition: charset.h:95
#define MUTT_ICONV_NO_FLAGS
No flags are set.
Definition: charset.h:71
Convenience wrapper for the library headers.
GUI display the mailboxes in a side panel.
Container for Accounts, Notifications.
Definition: neomutt.h:37
struct ConfigSubset * sub
Inherited config items.
Definition: neomutt.h:39
void imap_utf_encode(bool unicode, char **s)
Encode email from local charset to UTF-8.
Definition: utf7.c:390
static char * utf8_to_utf7(const char *u8, size_t u8len, char **u7, size_t *u7len)
Convert data from UTF-8 to RFC2060's UTF-7.
Definition: utf7.c:249
void imap_utf_decode(bool unicode, char **s)
Decode email from UTF-8 to local charset.
Definition: utf7.c:420
static const char B64Chars[64]
Characters of the Base64 encoding.
Definition: utf7.c:81
static char * utf7_to_utf8(const char *u7, size_t u7len, char **u8, size_t *u8len)
Convert data from RFC2060's UTF-7 to UTF-8.
Definition: utf7.c:105
const int Index64u[128]
Lookup table for Base64 encoding/decoding.
Definition: utf7.c:65