NeoMutt  2024-11-14-138-ge5ca67
Teaching an old dog new tricks
DOXYGEN
Loading...
Searching...
No Matches
utf7.c File Reference

Convert strings to/from utf7/utf8. More...

#include "config.h"
#include <stdbool.h>
#include <string.h>
#include "private.h"
#include "mutt/lib.h"
#include "core/lib.h"
+ Include dependency graph for utf7.c:

Go to the source code of this file.

Functions

static char * utf7_to_utf8 (const char *u7, size_t u7len, char **u8, size_t *u8len)
 Convert data from RFC2060's UTF-7 to UTF-8.
 
static char * utf8_to_utf7 (const char *u8, size_t u8len, char **u7, size_t *u7len)
 Convert data from UTF-8 to RFC2060's UTF-7.
 
void imap_utf_encode (bool unicode, char **s)
 Encode email from local charset to UTF-8.
 
void imap_utf_decode (bool unicode, char **s)
 Decode email from UTF-8 to local charset.
 

Variables

static const int Index64u [128]
 Lookup table for Base64 encoding/decoding.
 
static const char B64Chars [64]
 Characters of the Base64 encoding.
 

Detailed Description

Convert strings to/from utf7/utf8.

Authors
  • Edmund Grimley Evans
  • Richard Russon
  • Pietro Cerutti

This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with this program. If not, see http://www.gnu.org/licenses/.

Definition in file utf7.c.

Function Documentation

◆ utf7_to_utf8()

static char * utf7_to_utf8 ( const char *  u7,
size_t  u7len,
char **  u8,
size_t *  u8len 
)
static

Convert data from RFC2060's UTF-7 to UTF-8.

Parameters
[in]u7UTF-7 data
[in]u7lenLength of UTF-7 data
[out]u8Save the UTF-8 data pointer
[out]u8lenSave the UTF-8 data length
Return values
ptrUTF-8 data
NULLError

RFC2060 obviously intends the encoding to be unique (see point 5 in section 5.1.3), so we reject any non-canonical form, such as &ACY- (instead of &-) or &AMA-&AMA- (instead of &AMAAwA-).

Note
The result is null-terminated.
The caller must free() the returned data.

Definition at line 106 of file utf7.c.

107{
108 int b, ch, k;
109
110 char *buf = MUTT_MEM_MALLOC(u7len + u7len / 8 + 1, char);
111 char *p = buf;
112 int pair1 = 0;
113
114 for (; u7len; u7++, u7len--)
115 {
116 if (*u7 == '&')
117 {
118 u7++;
119 u7len--;
120
121 if (u7len && (*u7 == '-'))
122 {
123 *p++ = '&';
124 continue;
125 }
126
127 ch = 0;
128 k = 10;
129 for (; u7len; u7++, u7len--)
130 {
131 if ((*u7 & 0x80) || ((b = Index64u[(int) *u7]) == -1))
132 break;
133 if (k > 0)
134 {
135 ch |= b << k;
136 k -= 6;
137 }
138 else
139 {
140 ch |= b >> (-k);
141 if (ch < 0x80)
142 {
143 if ((0x20 <= ch) && (ch < 0x7f))
144 {
145 /* Printable US-ASCII */
146 goto bail;
147 }
148 *p++ = ch;
149 }
150 else if (ch < 0x800)
151 {
152 *p++ = 0xc0 | (ch >> 6);
153 *p++ = 0x80 | (ch & 0x3f);
154 }
155 else
156 {
157 /* High surrogate pair */
158 if ((ch & ~0x3ff) == 0xd800)
159 {
160 if (pair1)
161 goto bail;
162 pair1 = ch;
163 }
164 else
165 {
166 /* Low surrogate pair */
167 if ((ch & ~0x3ff) == 0xdc00)
168 {
169 if (!pair1)
170 goto bail;
171
172 ch = ((pair1 - 0xd800) << 10) + (ch - 0xdc00) + 0x10000;
173 pair1 = 0;
174 }
175 if (pair1)
176 goto bail;
177
178 if (ch < 0x10000)
179 {
180 *p++ = 0xe0 | (ch >> 12);
181 *p++ = 0x80 | ((ch >> 6) & 0x3f);
182 *p++ = 0x80 | (ch & 0x3f);
183 }
184 else
185 {
186 *p++ = 0xf0 | (ch >> 18);
187 *p++ = 0x80 | ((ch >> 12) & 0x3f);
188 *p++ = 0x80 | ((ch >> 6) & 0x3f);
189 *p++ = 0x80 | (ch & 0x3f);
190 }
191 }
192 }
193
194 ch = (b << (16 + k)) & 0xffff;
195 k += 10;
196 }
197 }
198 if (ch || (k < 6))
199 {
200 /* Non-zero or too many extra bits */
201 goto bail;
202 }
203 if (!u7len || (*u7 != '-'))
204 {
205 /* BASE64 not properly terminated */
206 goto bail;
207 }
208 if ((u7len > 2) && (u7[1] == '&') && (u7[2] != '-'))
209 {
210 /* Adjacent BASE64 sections */
211 goto bail;
212 }
213 }
214 else if ((*u7 < 0x20) || (*u7 >= 0x7f))
215 {
216 /* Not printable US-ASCII */
217 goto bail;
218 }
219 else
220 {
221 *p++ = *u7;
222 }
223 }
224 *p++ = '\0';
225 if (u8len)
226 *u8len = p - buf;
227
228 MUTT_MEM_REALLOC(&buf, p - buf, char);
229 if (u8)
230 *u8 = buf;
231 return buf;
232
233bail:
234 FREE(&buf);
235 return NULL;
236}
#define FREE(x)
Definition: memory.h:55
#define MUTT_MEM_REALLOC(pptr, n, type)
Definition: memory.h:43
#define MUTT_MEM_MALLOC(n, type)
Definition: memory.h:41
static const int Index64u[128]
Lookup table for Base64 encoding/decoding.
Definition: utf7.c:66
+ Here is the caller graph for this function:

◆ utf8_to_utf7()

static char * utf8_to_utf7 ( const char *  u8,
size_t  u8len,
char **  u7,
size_t *  u7len 
)
static

Convert data from UTF-8 to RFC2060's UTF-7.

Parameters
[in]u8UTF-8 data
[in]u8lenLength of UTF-8 data
[out]u7Save the UTF-7 data pointer
[out]u7lenSave the UTF-7 data length
Return values
ptrUTF-7 data
NULLError

Unicode characters above U+FFFF converted to a UTF-16 surrogate pair.

Note
The result is null-terminated.
The caller must free() the returned data.

Definition at line 252 of file utf7.c.

253{
254 int ch;
255 int n, b = 0, k = 0;
256 bool base64 = false;
257
258 /* In the worst case we convert 2 chars to 7 chars. For example:
259 * "\x10&\x10&..." -> "&ABA-&-&ABA-&-...". */
260 char *buf = MUTT_MEM_MALLOC((u8len / 2) * 7 + 6, char);
261 char *p = buf;
262
263 while (u8len)
264 {
265 unsigned char c = *u8;
266
267 if (c < 0x80)
268 {
269 ch = c;
270 n = 0;
271 }
272 else if (c < 0xc2)
273 {
274 goto bail;
275 }
276 else if (c < 0xe0)
277 {
278 ch = c & 0x1f;
279 n = 1;
280 }
281 else if (c < 0xf0)
282 {
283 ch = c & 0x0f;
284 n = 2;
285 }
286 else if (c < 0xf8)
287 {
288 ch = c & 0x07;
289 n = 3;
290 }
291 else if (c < 0xfc)
292 {
293 ch = c & 0x03;
294 n = 4;
295 }
296 else if (c < 0xfe)
297 {
298 ch = c & 0x01;
299 n = 5;
300 }
301 else
302 {
303 goto bail;
304 }
305
306 u8++;
307 u8len--;
308 if (n > u8len)
309 goto bail;
310 for (int i = 0; i < n; i++)
311 {
312 if ((u8[i] & 0xc0) != 0x80)
313 goto bail;
314 ch = (ch << 6) | (u8[i] & 0x3f);
315 }
316 if ((n > 1) && !(ch >> (n * 5 + 1)))
317 goto bail;
318 u8 += n;
319 u8len -= n;
320
321 if ((ch < 0x20) || (ch >= 0x7f))
322 {
323 if (!base64)
324 {
325 *p++ = '&';
326 base64 = true;
327 b = 0;
328 k = 10;
329 }
330
331 // For code points >= 0x10000 we need to use a UTF-16 surrogate pair
332 if (ch & ~0xffff)
333 {
334 ch -= 0x10000;
335 int pair1 = 0xd800 + (ch >> 10);
336 int pair2 = 0xdc00 + (ch & 0x3ff);
337
338 /* Output the high surrogate */
339 *p++ = B64Chars[b | pair1 >> k];
340 k -= 6;
341 for (; k >= 0; k -= 6)
342 *p++ = B64Chars[(pair1 >> k) & 0x3f];
343 b = (pair1 << (-k)) & 0x3f;
344 k += 16;
345
346 /* The low surrogate will be output just below */
347 ch = pair2;
348 }
349
350 *p++ = B64Chars[b | ch >> k];
351 k -= 6;
352 for (; k >= 0; k -= 6)
353 *p++ = B64Chars[(ch >> k) & 0x3f];
354 b = (ch << (-k)) & 0x3f;
355 k += 16;
356 }
357 else
358 {
359 if (base64)
360 {
361 if (k > 10)
362 *p++ = B64Chars[b];
363 *p++ = '-';
364 base64 = false;
365 }
366 *p++ = ch;
367 if (ch == '&')
368 *p++ = '-';
369 }
370 }
371
372 if (base64)
373 {
374 if (k > 10)
375 *p++ = B64Chars[b];
376 *p++ = '-';
377 }
378
379 *p++ = '\0';
380 if (u7len)
381 *u7len = p - buf;
382 MUTT_MEM_REALLOC(&buf, p - buf, char);
383 if (u7)
384 *u7 = buf;
385 return buf;
386
387bail:
388 FREE(&buf);
389 return NULL;
390}
static const char B64Chars[64]
Characters of the Base64 encoding.
Definition: utf7.c:82
+ Here is the caller graph for this function:

◆ imap_utf_encode()

void imap_utf_encode ( bool  unicode,
char **  s 
)

Encode email from local charset to UTF-8.

Parameters
[in]unicodetrue if Unicode is allowed
[out]sEmail to convert

Definition at line 397 of file utf7.c.

398{
399 if (!s || !*s)
400 return;
401
402 const char *c_charset = cc_charset();
403 if (!c_charset)
404 return;
405
406 if (unicode && mutt_ch_is_utf8(c_charset))
407 {
408 return;
409 }
410
411 if (mutt_ch_convert_string(s, c_charset, "utf-8", MUTT_ICONV_NO_FLAGS) != 0)
412 {
413 FREE(s);
414 return;
415 }
416
417 if (!unicode)
418 {
419 char *utf7 = utf8_to_utf7(*s, strlen(*s), NULL, 0);
420 FREE(s);
421 *s = utf7;
422 }
423}
const char * cc_charset(void)
Get the cached value of $charset.
Definition: config_cache.c:116
int mutt_ch_convert_string(char **ps, const char *from, const char *to, uint8_t flags)
Convert a string between encodings.
Definition: charset.c:831
#define mutt_ch_is_utf8(str)
Definition: charset.h:89
#define MUTT_ICONV_NO_FLAGS
No flags are set.
Definition: charset.h:64
static char * utf8_to_utf7(const char *u8, size_t u8len, char **u7, size_t *u7len)
Convert data from UTF-8 to RFC2060's UTF-7.
Definition: utf7.c:252
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ imap_utf_decode()

void imap_utf_decode ( bool  unicode,
char **  s 
)

Decode email from UTF-8 to local charset.

Parameters
[in]unicodetrue if Unicode is allowed
[out]sEmail to convert

Definition at line 430 of file utf7.c.

431{
432 if (!s || !*s)
433 return;
434
435 const char *c_charset = cc_charset();
436 if (!c_charset)
437 return;
438
439 if (unicode && mutt_ch_is_utf8(c_charset))
440 {
441 return;
442 }
443
444 if (!unicode)
445 {
446 char *utf8 = utf7_to_utf8(*s, strlen(*s), 0, 0);
447 FREE(s);
448 *s = utf8;
449 }
450
451 if (mutt_ch_convert_string(s, "utf-8", c_charset, MUTT_ICONV_NO_FLAGS) != 0)
452 {
453 FREE(s);
454 }
455}
static char * utf7_to_utf8(const char *u7, size_t u7len, char **u8, size_t *u8len)
Convert data from RFC2060's UTF-7 to UTF-8.
Definition: utf7.c:106
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

Variable Documentation

◆ Index64u

const int Index64u[128]
static
Initial value:
= {
-1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
-1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
-1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,62, 63,-1,-1,-1,
52,53,54,55, 56,57,58,59, 60,61,-1,-1, -1,-1,-1,-1,
-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10, 11,12,13,14,
15,16,17,18, 19,20,21,22, 23,24,25,-1, -1,-1,-1,-1,
-1,26,27,28, 29,30,31,32, 33,34,35,36, 37,38,39,40,
41,42,43,44, 45,46,47,48, 49,50,51,-1, -1,-1,-1,-1
}

Lookup table for Base64 encoding/decoding.

This is very similar to the table in lib/lib_base64.c Encoding chars: utf7 A-Za-z0-9+, mime A-Za-z0-9+/

Definition at line 66 of file utf7.c.

◆ B64Chars

const char B64Chars[64]
static
Initial value:
= {
'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', ',',
}

Characters of the Base64 encoding.

Definition at line 82 of file utf7.c.