NeoMutt
Teaching an old dog new tricks
DOXYGEN
Loading...
Searching...
No Matches
utf7.c File Reference

Convert strings to/from utf7/utf8. More...

#include "config.h"
#include <stdbool.h>
#include <string.h>
#include "private.h"
#include "mutt/lib.h"
#include "core/lib.h"
+ Include dependency graph for utf7.c:

Go to the source code of this file.

Functions

static char * utf7_to_utf8 (const char *u7, size_t u7len, char **u8, size_t *u8len)
 Convert data from RFC2060's UTF-7 to UTF-8.
 
static char * utf8_to_utf7 (const char *u8, size_t u8len, char **u7, size_t *u7len)
 Convert data from UTF-8 to RFC2060's UTF-7.
 
void imap_utf_encode (bool unicode, char **s)
 Encode a string from the local charset to UTF-8, or to RFC2060's UTF-7 when Unicode is not allowed.
 
void imap_utf_decode (bool unicode, char **s)
 Decode a string from UTF-8, or from RFC2060's UTF-7 when Unicode is not allowed, to the local charset.
 

Variables

static const int Index64u [128]
 Lookup table for Base64 encoding/decoding.
 
static const char B64Chars [64]
 Characters of the Base64 encoding.
 

Detailed Description

Convert strings to/from utf7/utf8.

Authors
  • Edmund Grimley Evans

This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with this program. If not, see http://www.gnu.org/licenses/.

Definition in file utf7.c.

Function Documentation

◆ utf7_to_utf8()

static char * utf7_to_utf8 ( const char *  u7,
size_t  u7len,
char **  u8,
size_t *  u8len 
)
static

Convert data from RFC2060's UTF-7 to UTF-8.

Parameters
[in] u7: UTF-7 data
[in] u7len: Length of UTF-7 data
[out] u8: Save the UTF-8 data pointer
[out] u8len: Save the UTF-8 data length (the stored value counts the terminating NUL)
Return values
ptr: UTF-8 data
NULL: Error

RFC2060 obviously intends the encoding to be unique (see point 5 in section 5.1.3), so we reject any non-canonical form, such as &ACY- (instead of &-) or &AMA-&AMA- (instead of &AMAAwA-).

Note
The result is null-terminated.
The caller must free() the returned data.

Definition at line 104 of file utf7.c.

105{
106 int b, ch, k;
107
108 char *buf = mutt_mem_malloc(u7len + u7len / 8 + 1);
109 char *p = buf;
110 int pair1 = 0;
111
112 for (; u7len; u7++, u7len--)
113 {
114 if (*u7 == '&')
115 {
116 u7++;
117 u7len--;
118
119 if (u7len && (*u7 == '-'))
120 {
121 *p++ = '&';
122 continue;
123 }
124
125 ch = 0;
126 k = 10;
127 for (; u7len; u7++, u7len--)
128 {
129 if ((*u7 & 0x80) || ((b = Index64u[(int) *u7]) == -1))
130 break;
131 if (k > 0)
132 {
133 ch |= b << k;
134 k -= 6;
135 }
136 else
137 {
138 ch |= b >> (-k);
139 if (ch < 0x80)
140 {
141 if ((0x20 <= ch) && (ch < 0x7f))
142 {
143 /* Printable US-ASCII */
144 goto bail;
145 }
146 *p++ = ch;
147 }
148 else if (ch < 0x800)
149 {
150 *p++ = 0xc0 | (ch >> 6);
151 *p++ = 0x80 | (ch & 0x3f);
152 }
153 else
154 {
155 /* High surrogate pair */
156 if ((ch & ~0x3ff) == 0xd800)
157 {
158 if (pair1)
159 goto bail;
160 pair1 = ch;
161 }
162 else
163 {
164 /* Low surrogate pair */
165 if ((ch & ~0x3ff) == 0xdc00)
166 {
167 if (!pair1)
168 goto bail;
169
170 ch = ((pair1 - 0xd800) << 10) + (ch - 0xdc00) + 0x10000;
171 pair1 = 0;
172 }
173 if (pair1)
174 goto bail;
175
176 if (ch < 0x10000)
177 {
178 *p++ = 0xe0 | (ch >> 12);
179 *p++ = 0x80 | ((ch >> 6) & 0x3f);
180 *p++ = 0x80 | (ch & 0x3f);
181 }
182 else
183 {
184 *p++ = 0xf0 | (ch >> 18);
185 *p++ = 0x80 | ((ch >> 12) & 0x3f);
186 *p++ = 0x80 | ((ch >> 6) & 0x3f);
187 *p++ = 0x80 | (ch & 0x3f);
188 }
189 }
190 }
191
192 ch = (b << (16 + k)) & 0xffff;
193 k += 10;
194 }
195 }
196 if (ch || (k < 6))
197 {
198 /* Non-zero or too many extra bits */
199 goto bail;
200 }
201 if (!u7len || (*u7 != '-'))
202 {
203 /* BASE64 not properly terminated */
204 goto bail;
205 }
206 if ((u7len > 2) && (u7[1] == '&') && (u7[2] != '-'))
207 {
208 /* Adjacent BASE64 sections */
209 goto bail;
210 }
211 }
212 else if ((*u7 < 0x20) || (*u7 >= 0x7f))
213 {
214 /* Not printable US-ASCII */
215 goto bail;
216 }
217 else
218 {
219 *p++ = *u7;
220 }
221 }
222 *p++ = '\0';
223 if (u8len)
224 *u8len = p - buf;
225
226 mutt_mem_realloc(&buf, p - buf);
227 if (u8)
228 *u8 = buf;
229 return buf;
230
231bail:
232 FREE(&buf);
233 return NULL;
234}
void * mutt_mem_malloc(size_t size)
Allocate memory on the heap.
Definition: memory.c:90
void mutt_mem_realloc(void *ptr, size_t size)
Resize a block of memory on the heap.
Definition: memory.c:114
#define FREE(x)
Definition: memory.h:45
static const int Index64u[128]
Lookup table for Base64 encoding/decoding.
Definition: utf7.c:64
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ utf8_to_utf7()

static char * utf8_to_utf7 ( const char *  u8,
size_t  u8len,
char **  u7,
size_t *  u7len 
)
static

Convert data from UTF-8 to RFC2060's UTF-7.

Parameters
[in] u8: UTF-8 data
[in] u8len: Length of UTF-8 data
[out] u7: Save the UTF-7 data pointer
[out] u7len: Save the UTF-7 data length (the stored value counts the terminating NUL)
Return values
ptr: UTF-7 data
NULL: Error

Unicode characters above U+FFFF are converted to a UTF-16 surrogate pair.

Note
The result is null-terminated.
The caller must free() the returned data.

Definition at line 250 of file utf7.c.

251{
252 int ch;
253 int n, b = 0, k = 0;
254 bool base64 = false;
255
256 /* In the worst case we convert 2 chars to 7 chars. For example:
257 * "\x10&\x10&..." -> "&ABA-&-&ABA-&-...". */
258 char *buf = mutt_mem_malloc((u8len / 2) * 7 + 6);
259 char *p = buf;
260
261 while (u8len)
262 {
263 unsigned char c = *u8;
264
265 if (c < 0x80)
266 {
267 ch = c;
268 n = 0;
269 }
270 else if (c < 0xc2)
271 {
272 goto bail;
273 }
274 else if (c < 0xe0)
275 {
276 ch = c & 0x1f;
277 n = 1;
278 }
279 else if (c < 0xf0)
280 {
281 ch = c & 0x0f;
282 n = 2;
283 }
284 else if (c < 0xf8)
285 {
286 ch = c & 0x07;
287 n = 3;
288 }
289 else if (c < 0xfc)
290 {
291 ch = c & 0x03;
292 n = 4;
293 }
294 else if (c < 0xfe)
295 {
296 ch = c & 0x01;
297 n = 5;
298 }
299 else
300 {
301 goto bail;
302 }
303
304 u8++;
305 u8len--;
306 if (n > u8len)
307 goto bail;
308 for (int i = 0; i < n; i++)
309 {
310 if ((u8[i] & 0xc0) != 0x80)
311 goto bail;
312 ch = (ch << 6) | (u8[i] & 0x3f);
313 }
314 if ((n > 1) && !(ch >> (n * 5 + 1)))
315 goto bail;
316 u8 += n;
317 u8len -= n;
318
319 if ((ch < 0x20) || (ch >= 0x7f))
320 {
321 if (!base64)
322 {
323 *p++ = '&';
324 base64 = true;
325 b = 0;
326 k = 10;
327 }
328
329 // For code points >= 0x10000 we need to use a UTF-16 surrogate pair
330 if (ch & ~0xffff)
331 {
332 ch -= 0x10000;
333 int pair1 = 0xd800 + (ch >> 10);
334 int pair2 = 0xdc00 + (ch & 0x3ff);
335
336 /* Output the high surrogate */
337 *p++ = B64Chars[b | pair1 >> k];
338 k -= 6;
339 for (; k >= 0; k -= 6)
340 *p++ = B64Chars[(pair1 >> k) & 0x3f];
341 b = (pair1 << (-k)) & 0x3f;
342 k += 16;
343
344 /* The low surrogate will be output just below */
345 ch = pair2;
346 }
347
348 *p++ = B64Chars[b | ch >> k];
349 k -= 6;
350 for (; k >= 0; k -= 6)
351 *p++ = B64Chars[(ch >> k) & 0x3f];
352 b = (ch << (-k)) & 0x3f;
353 k += 16;
354 }
355 else
356 {
357 if (base64)
358 {
359 if (k > 10)
360 *p++ = B64Chars[b];
361 *p++ = '-';
362 base64 = false;
363 }
364 *p++ = ch;
365 if (ch == '&')
366 *p++ = '-';
367 }
368 }
369
370 if (base64)
371 {
372 if (k > 10)
373 *p++ = B64Chars[b];
374 *p++ = '-';
375 }
376
377 *p++ = '\0';
378 if (u7len)
379 *u7len = p - buf;
380 mutt_mem_realloc(&buf, p - buf);
381 if (u7)
382 *u7 = buf;
383 return buf;
384
385bail:
386 FREE(&buf);
387 return NULL;
388}
static const char B64Chars[64]
Characters of the Base64 encoding.
Definition: utf7.c:80
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ imap_utf_encode()

void imap_utf_encode ( bool  unicode,
char **  s 
)

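Encode a string from the local charset to UTF-8, or to RFC2060's UTF-7 when Unicode is not allowed.

Parameters
[in] unicode: true if Unicode is allowed
[in,out] s: String to convert; the converted string replaces *s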

Definition at line 395 of file utf7.c.

396{
397 if (!s || !*s)
398 return;
399
400 const char *c_charset = cc_charset();
401 if (!c_charset)
402 return;
403
404 if (unicode && mutt_ch_is_utf8(c_charset))
405 {
406 return;
407 }
408
409 if (mutt_ch_convert_string(s, c_charset, "utf-8", MUTT_ICONV_NO_FLAGS) != 0)
410 {
411 FREE(s);
412 return;
413 }
414
415 if (!unicode)
416 {
417 char *utf7 = utf8_to_utf7(*s, strlen(*s), NULL, 0);
418 FREE(s);
419 *s = utf7;
420 }
421}
const char * cc_charset(void)
Get the cached value of $charset.
Definition: config_cache.c:115
int mutt_ch_convert_string(char **ps, const char *from, const char *to, uint8_t flags)
Convert a string between encodings.
Definition: charset.c:826
#define mutt_ch_is_utf8(str)
Definition: charset.h:96
#define MUTT_ICONV_NO_FLAGS
No flags are set.
Definition: charset.h:71
static char * utf8_to_utf7(const char *u8, size_t u8len, char **u7, size_t *u7len)
Convert data from UTF-8 to RFC2060's UTF-7.
Definition: utf7.c:250
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ imap_utf_decode()

void imap_utf_decode ( bool  unicode,
char **  s 
)

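Decode a string from UTF-8, or from RFC2060's UTF-7 when Unicode is not allowed, to the local charset.

Parameters
[in] unicode: true if Unicode is allowed
[in,out] s: String to convert; the converted string replaces *s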

Definition at line 428 of file utf7.c.

429{
430 if (!s || !*s)
431 return;
432
433 const char *c_charset = cc_charset();
434 if (!c_charset)
435 return;
436
437 if (unicode && mutt_ch_is_utf8(c_charset))
438 {
439 return;
440 }
441
442 if (!unicode)
443 {
444 char *utf8 = utf7_to_utf8(*s, strlen(*s), 0, 0);
445 FREE(s);
446 *s = utf8;
447 }
448
449 if (mutt_ch_convert_string(s, "utf-8", c_charset, MUTT_ICONV_NO_FLAGS) != 0)
450 {
451 FREE(s);
452 }
453}
static char * utf7_to_utf8(const char *u7, size_t u7len, char **u8, size_t *u8len)
Convert data from RFC2060's UTF-7 to UTF-8.
Definition: utf7.c:104
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

Variable Documentation

◆ Index64u

const int Index64u[128]
static
Initial value:
= {
-1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
-1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
-1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,62, 63,-1,-1,-1,
52,53,54,55, 56,57,58,59, 60,61,-1,-1, -1,-1,-1,-1,
-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10, 11,12,13,14,
15,16,17,18, 19,20,21,22, 23,24,25,-1, -1,-1,-1,-1,
-1,26,27,28, 29,30,31,32, 33,34,35,36, 37,38,39,40,
41,42,43,44, 45,46,47,48, 49,50,51,-1, -1,-1,-1,-1
}

Lookup table for Base64 encoding/decoding.

This is very similar to the table in lib/lib_base64.c. Encoding chars: UTF-7 uses "A-Za-z0-9+," (the comma is the 64th character), whereas MIME uses "A-Za-z0-9+/".

Definition at line 64 of file utf7.c.

◆ B64Chars

const char B64Chars[64]
static
Initial value:
= {
'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', ',',
}

Characters of the Base64 encoding.

Definition at line 80 of file utf7.c.