NeoMutt  2021-10-29-43-g6b8931
Teaching an old dog new tricks
DOXYGEN
utf7.c File Reference

Convert strings to/from utf7/utf8. More...

#include "config.h"
#include <stdbool.h>
#include <string.h>
#include "private.h"
#include "mutt/lib.h"
#include "config/lib.h"
#include "core/lib.h"
+ Include dependency graph for utf7.c:

Go to the source code of this file.

Functions

static char * utf7_to_utf8 (const char *u7, size_t u7len, char **u8, size_t *u8len)
 Convert data from RFC2060's UTF-7 to UTF-8. More...
 
static char * utf8_to_utf7 (const char *u8, size_t u8len, char **u7, size_t *u7len)
 Convert data from UTF-8 to RFC2060's UTF-7. More...
 
void imap_utf_encode (bool unicode, char **s)
 Encode a string from the local charset to UTF-8, or to RFC2060's UTF-7 when Unicode is not allowed. More...
 
void imap_utf_decode (bool unicode, char **s)
 Decode a string from UTF-8 (or from RFC2060's UTF-7 when Unicode is not allowed) to the local charset. More...
 

Variables

const int Index64u [128]
 Lookup table for Base64 encoding/decoding. More...
 
static const char B64Chars [64]
 Characters of the Base64 encoding. More...
 

Detailed Description

Convert strings to/from utf7/utf8.

Authors
  • Edmund Grimley Evans

This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with this program. If not, see http://www.gnu.org/licenses/.

Definition in file utf7.c.

Function Documentation

◆ utf7_to_utf8()

static char* utf7_to_utf8 ( const char *  u7,
size_t  u7len,
char **  u8,
size_t *  u8len 
)
static

Convert data from RFC2060's UTF-7 to UTF-8.

Parameters
[in] u7: UTF-7 data
[in] u7len: Length of UTF-7 data
[out] u8: Save the UTF-8 data pointer
[out] u8len: Save the UTF-8 data length
Return values
ptr: UTF-8 data
NULL: Error

RFC2060 obviously intends the encoding to be unique (see point 5 in section 5.1.3), so we reject any non-canonical form, such as &ACY- (instead of &-) or &AMA-&AMA- (instead of &AMAAwA-).

Note
The result is null-terminated.
The caller must free() the returned data.

Definition at line 85 of file utf7.c.

86 { /* Decode u7len bytes of RFC2060 UTF-7 at u7 into a newly allocated, NUL-terminated UTF-8 buffer; returns NULL on malformed or non-canonical input */
87  int b, ch, k; /* b: current 6-bit base64 value, ch: 16-bit unit being assembled, k: shift for the next 6-bit group */
88 
89  char *buf = mutt_mem_malloc(u7len + u7len / 8 + 1); /* NOTE(review): sizing presumably relies on 8 base64 chars (three 16-bit units) producing at most 9 UTF-8 bytes - confirm */
90  char *p = buf; /* write cursor into buf */
91 
92  for (; u7len; u7++, u7len--)
93  {
94  if (*u7 == '&') /* '&' introduces either "&-" or a base64 section */
95  {
96  u7++;
97  u7len--;
98 
99  if (u7len && (*u7 == '-')) /* "&-" is the escaped form of a literal '&' */
100  {
101  *p++ = '&';
102  continue;
103  }
104 
105  ch = 0;
106  k = 10; /* the first 6-bit group lands in bits 15..10 of the unit */
107  for (; u7len; u7++, u7len--)
108  {
109  if ((*u7 & 0x80) || ((b = Index64u[(int) *u7]) == -1)) /* stop at the first byte that is not in the base64 alphabet */
110  break;
111  if (k > 0) /* the unit is not complete yet: shift the whole group in */
112  {
113  ch |= b << k;
114  k -= 6;
115  }
116  else /* this group completes the unit; its top bits are used now, the rest is carried over */
117  {
118  ch |= b >> (-k);
119  if (ch < 0x80)
120  {
121  if ((0x20 <= ch) && (ch < 0x7f))
122  {
123  /* Printable US-ASCII */
124  goto bail;
125  }
126  *p++ = ch;
127  }
128  else if (ch < 0x800) /* emitted as a two-byte UTF-8 sequence */
129  {
130  *p++ = 0xc0 | (ch >> 6);
131  *p++ = 0x80 | (ch & 0x3f);
132  }
133  else /* emitted as a three-byte UTF-8 sequence */
134  {
135  *p++ = 0xe0 | (ch >> 12);
136  *p++ = 0x80 | ((ch >> 6) & 0x3f);
137  *p++ = 0x80 | (ch & 0x3f);
138  }
139  ch = (b << (16 + k)) & 0xffff; /* start the next unit with the low bits of b that were not consumed above */
140  k += 10;
141  }
142  }
143  if (ch || (k < 6))
144  {
145  /* Non-zero or too many extra bits */
146  goto bail;
147  }
148  if (!u7len || (*u7 != '-'))
149  {
150  /* BASE64 not properly terminated */
151  goto bail;
152  }
153  if ((u7len > 2) && (u7[1] == '&') && (u7[2] != '-'))
154  {
155  /* Adjacent BASE64 sections */
156  goto bail;
157  }
158  }
159  else if ((*u7 < 0x20) || (*u7 >= 0x7f))
160  {
161  /* Not printable US-ASCII */
162  goto bail;
163  }
164  else
165  *p++ = *u7; /* printable ASCII is copied through unchanged */
166  }
167  *p++ = '\0';
168  if (u8len)
169  *u8len = p - buf; /* the reported length counts the terminating NUL written above */
170 
171  mutt_mem_realloc(&buf, p - buf); /* resize the allocation to the bytes actually written */
172  if (u8)
173  *u8 = buf;
174  return buf;
175 
176 bail: /* error path: release the partial output and report failure */
177  FREE(&buf);
178  return NULL;
179 }
void * mutt_mem_malloc(size_t size)
Allocate memory on the heap.
Definition: memory.c:90
void mutt_mem_realloc(void *ptr, size_t size)
Resize a block of memory on the heap.
Definition: memory.c:114
#define FREE(x)
Definition: memory.h:40
const int Index64u[128]
Lookup table for Base64 encoding/decoding.
Definition: utf7.c:45
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ utf8_to_utf7()

static char* utf8_to_utf7 ( const char *  u8,
size_t  u8len,
char **  u7,
size_t *  u7len 
)
static

Convert data from UTF-8 to RFC2060's UTF-7.

Parameters
[in] u8: UTF-8 data
[in] u8len: Length of UTF-8 data
[out] u7: Save the UTF-7 data pointer
[out] u7len: Save the UTF-7 data length
Return values
ptr: UTF-7 data
NULL: Error

Unicode characters above U+FFFF are replaced by U+FFFE.

Note
The result is null-terminated.
The caller must free() the returned data.

Definition at line 195 of file utf7.c.

196 { /* Encode u8len bytes of UTF-8 at u8 into a newly allocated, NUL-terminated RFC2060 UTF-7 buffer; returns NULL on invalid UTF-8 */
197  int ch; /* code point being decoded from the UTF-8 input */
198  int n, b = 0, k = 0; /* n: continuation bytes expected, b: pending base64 bits, k: shift state of the base64 encoder */
199  bool base64 = false; /* true while inside an open "&...-" section */
200 
201  /* In the worst case we convert 2 chars to 7 chars. For example:
202  * "\x10&\x10&..." -> "&ABA-&-&ABA-&-...". */
203  char *buf = mutt_mem_malloc((u8len / 2) * 7 + 6);
204  char *p = buf; /* write cursor into buf */
205 
206  while (u8len)
207  {
208  unsigned char c = *u8; /* read the lead byte as unsigned so the range tests below are well defined */
209 
210  if (c < 0x80) /* single-byte (ASCII) character */
211  {
212  ch = c;
213  n = 0;
214  }
215  else if (c < 0xc2) /* bytes 0x80..0xc1 are rejected as lead bytes */
216  goto bail;
217  else if (c < 0xe0)
218  {
219  ch = c & 0x1f;
220  n = 1;
221  }
222  else if (c < 0xf0)
223  {
224  ch = c & 0x0f;
225  n = 2;
226  }
227  else if (c < 0xf8)
228  {
229  ch = c & 0x07;
230  n = 3;
231  }
232  else if (c < 0xfc)
233  {
234  ch = c & 0x03;
235  n = 4;
236  }
237  else if (c < 0xfe)
238  {
239  ch = c & 0x01;
240  n = 5;
241  }
242  else /* 0xfe and 0xff are rejected */
243  goto bail;
244 
245  u8++;
246  u8len--;
247  if (n > u8len) /* the sequence would run past the end of the input */
248  goto bail;
249  for (int i = 0; i < n; i++)
250  {
251  if ((u8[i] & 0xc0) != 0x80) /* every continuation byte must have the form 10xxxxxx */
252  goto bail;
253  ch = (ch << 6) | (u8[i] & 0x3f);
254  }
255  if ((n > 1) && !(ch >> (n * 5 + 1))) /* reject values that fit in n*5+1 bits, i.e. that a shorter sequence could encode */
256  goto bail;
257  u8 += n;
258  u8len -= n;
259 
260  if ((ch < 0x20) || (ch >= 0x7f)) /* anything outside printable ASCII goes into a base64 section */
261  {
262  if (!base64)
263  {
264  *p++ = '&'; /* open a new base64 section */
265  base64 = true;
266  b = 0;
267  k = 10;
268  }
269  if (ch & ~0xffff) /* code points above U+FFFF are replaced by U+FFFE */
270  ch = 0xfffe;
271  *p++ = B64Chars[b | ch >> k]; /* combine the bits left over from the previous unit with the top bits of this one */
272  k -= 6;
273  for (; k >= 0; k -= 6)
274  *p++ = B64Chars[(ch >> k) & 0x3f];
275  b = (ch << (-k)) & 0x3f; /* keep the bits that did not fill a whole 6-bit group */
276  k += 16;
277  }
278  else
279  {
280  if (base64)
281  {
282  if (k > 10) /* k > 10 means b still holds bits that have not been emitted */
283  *p++ = B64Chars[b];
284  *p++ = '-'; /* close the base64 section */
285  base64 = false;
286  }
287  *p++ = ch;
288  if (ch == '&') /* a literal '&' is written as "&-" */
289  *p++ = '-';
290  }
291  }
292 
293  if (base64) /* the input ended inside a base64 section: flush and close it */
294  {
295  if (k > 10)
296  *p++ = B64Chars[b];
297  *p++ = '-';
298  }
299 
300  *p++ = '\0';
301  if (u7len)
302  *u7len = p - buf; /* the reported length counts the terminating NUL written above */
303  mutt_mem_realloc(&buf, p - buf); /* resize the allocation to the bytes actually written */
304  if (u7)
305  *u7 = buf;
306  return buf;
307 
308 bail: /* error path: release the partial output and report failure */
309  FREE(&buf);
310  return NULL;
311 }
static const char B64Chars[64]
Characters of the Base64 encoding.
Definition: utf7.c:61
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ imap_utf_encode()

void imap_utf_encode ( bool  unicode,
char **  s 
)

Encode a string from the local charset to UTF-8, or to RFC2060's UTF-7 when Unicode is not allowed.

Parameters
[in] unicode: true if Unicode is allowed
[in,out] s: String to convert; it is replaced in place, and freed if the conversion fails

Definition at line 318 of file utf7.c.

319 { /* Convert *s in place from the configured "charset" to UTF-8, then to UTF-7 when unicode is false */
320  const char *const c_charset = cs_subset_string(NeoMutt->sub, "charset");
321  if (!c_charset || !s || !*s) /* nothing to do without a charset or a string */
322  return;
323 
324  if (unicode && mutt_ch_is_utf8(c_charset)) /* the string is already in the target encoding, leave it untouched */
325  {
326  return;
327  }
328 
329  if (mutt_ch_convert_string(s, c_charset, "utf-8", MUTT_ICONV_NO_FLAGS) != 0)
330  {
331  FREE(s); /* conversion failed: the string is released */
332  return;
333  }
334 
335  if (!unicode)
336  {
337  char *utf7 = utf8_to_utf7(*s, strlen(*s), NULL, 0); /* returns NULL if *s is not valid UTF-8 */
338  FREE(s);
339  *s = utf7; /* *s becomes NULL when the UTF-7 encoding failed */
340  }
341 }
const char * cs_subset_string(const struct ConfigSubset *sub, const char *name)
Get a string config item by name.
Definition: helpers.c:317
int mutt_ch_convert_string(char **ps, const char *from, const char *to, uint8_t flags)
Convert a string between encodings.
Definition: charset.c:765
#define mutt_ch_is_utf8(str)
Definition: charset.h:95
#define MUTT_ICONV_NO_FLAGS
No flags are set.
Definition: charset.h:71
Container for Accounts, Notifications.
Definition: neomutt.h:37
struct ConfigSubset * sub
Inherited config items.
Definition: neomutt.h:39
static char * utf8_to_utf7(const char *u8, size_t u8len, char **u7, size_t *u7len)
Convert data from UTF-8 to RFC2060's UTF-7.
Definition: utf7.c:195
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ imap_utf_decode()

void imap_utf_decode ( bool  unicode,
char **  s 
)

Decode a string from UTF-8 (or from RFC2060's UTF-7 when Unicode is not allowed) to the local charset.

Parameters
[in] unicode: true if Unicode is allowed
[in,out] s: String to convert; it is replaced in place, and freed if the conversion fails

Definition at line 348 of file utf7.c.

349 { /* Convert *s in place to the configured "charset"; the input is treated as UTF-7 when unicode is false, otherwise as UTF-8 */
350  const char *const c_charset = cs_subset_string(NeoMutt->sub, "charset");
351  if (!c_charset || !s || !*s) /* nothing to do without a charset or a string */
352  return;
353 
354  if (unicode && mutt_ch_is_utf8(c_charset)) /* the string is already in the target encoding, leave it untouched */
355  {
356  return;
357  }
358 
359  if (!unicode)
360  {
361  char *utf8 = utf7_to_utf8(*s, strlen(*s), 0, 0); /* returns NULL if *s is not valid, canonical UTF-7 */
362  FREE(s);
363  *s = utf8; /* *s becomes NULL when the UTF-7 decoding failed */
364  }
365 
366  if (mutt_ch_convert_string(s, "utf-8", c_charset, MUTT_ICONV_NO_FLAGS) != 0) /* NOTE(review): *s may be NULL here after a failed decode - confirm mutt_ch_convert_string() accepts that */
367  {
368  FREE(s); /* conversion failed: the string is released */
369  }
370 }
static char * utf7_to_utf8(const char *u7, size_t u7len, char **u8, size_t *u8len)
Convert data from RFC2060's UTF-7 to UTF-8.
Definition: utf7.c:85
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

Variable Documentation

◆ Index64u

const int Index64u[128]
Initial value:
= { /* indexed by 7-bit ASCII code; yields the 6-bit value, or -1 for characters outside the alphabet */
-1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, /* 0x00-0x0f */
-1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, /* 0x10-0x1f */
-1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,62, 63,-1,-1,-1, /* 0x20-0x2f: '+' = 62, ',' = 63 */
52,53,54,55, 56,57,58,59, 60,61,-1,-1, -1,-1,-1,-1, /* 0x30-0x3f: '0'-'9' = 52-61 */
-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10, 11,12,13,14, /* 0x40-0x4f: 'A'-'O' = 0-14 */
15,16,17,18, 19,20,21,22, 23,24,25,-1, -1,-1,-1,-1, /* 0x50-0x5f: 'P'-'Z' = 15-25 */
-1,26,27,28, 29,30,31,32, 33,34,35,36, 37,38,39,40, /* 0x60-0x6f: 'a'-'o' = 26-40 */
41,42,43,44, 45,46,47,48, 49,50,51,-1, -1,-1,-1,-1 /* 0x70-0x7f: 'p'-'z' = 41-51 */
}

Lookup table for Base64 encoding/decoding.

This is very similar to the table in lib/lib_base64.c. The encoding characters differ only in the last one: utf7 uses A-Z, a-z, 0-9, '+' and ',' (comma), whereas mime uses A-Z, a-z, 0-9, '+' and '/'.

Definition at line 45 of file utf7.c.

◆ B64Chars

const char B64Chars[64]
static
Initial value:
= { /* maps a 6-bit value (0-63) to its encoding character */
'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', ',', /* the last character is ',' where MIME base64 uses '/' */
}

Characters of the Base64 encoding.

Definition at line 61 of file utf7.c.