NeoMutt  2020-08-07-1-gab41a1
Teaching an old dog new tricks
DOXYGEN
utf7.c File Reference

Convert strings to/from utf7/utf8. More...

#include "config.h"
#include <stdbool.h>
#include <string.h>
#include "private.h"
#include "mutt/lib.h"
+ Include dependency graph for utf7.c:

Go to the source code of this file.

Functions

static char * utf7_to_utf8 (const char *u7, size_t u7len, char **u8, size_t *u8len)
 Convert data from RFC2060's UTF-7 to UTF-8. More...
 
static char * utf8_to_utf7 (const char *u8, size_t u8len, char **u7, size_t *u7len)
 Convert data from UTF-8 to RFC2060's UTF-7. More...
 
void imap_utf_encode (bool unicode, char **s)
 Encode email from local charset to UTF-8, or to RFC2060's UTF-7 when unicode is false. More...
 
void imap_utf_decode (bool unicode, char **s)
 Decode email from UTF-8, or from RFC2060's UTF-7 when unicode is false, to local charset. More...
 

Variables

const int Index64u [128]
 Lookup table for Base64 encoding/decoding. More...
 
static const char B64Chars [64]
 Characters of the Base64 encoding. More...
 

Detailed Description

Convert strings to/from utf7/utf8.

Authors
  • Edmund Grimley Evans

This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with this program. If not, see http://www.gnu.org/licenses/.

Definition in file utf7.c.

Function Documentation

◆ utf7_to_utf8()

static char* utf7_to_utf8 ( const char *  u7,
size_t  u7len,
char **  u8,
size_t *  u8len 
)
static

Convert data from RFC2060's UTF-7 to UTF-8.

Parameters
[in] u7: UTF-7 data
[in] u7len: Length of UTF-7 data
[out] u8: Save the UTF-8 data pointer
[out] u8len: Save the UTF-8 data length
Return values
ptr: UTF-8 data
NULL: Error

RFC2060 obviously intends the encoding to be unique (see point 5 in section 5.1.3), so we reject any non-canonical form, such as &ACY- (instead of &-) or &AMA-&AMA- (instead of &AMAAwA-).

Note
The result is null-terminated.
The caller must free() the returned data.

Definition at line 83 of file utf7.c.

84 {
85  int b, ch, k;
86 
87  char *buf = mutt_mem_malloc(u7len + u7len / 8 + 1);
88  char *p = buf;
89 
90  for (; u7len; u7++, u7len--)
91  {
92  if (*u7 == '&')
93  {
94  u7++;
95  u7len--;
96 
97  if (u7len && (*u7 == '-'))
98  {
99  *p++ = '&';
100  continue;
101  }
102 
103  ch = 0;
104  k = 10;
105  for (; u7len; u7++, u7len--)
106  {
107  if ((*u7 & 0x80) || ((b = Index64u[(int) *u7]) == -1))
108  break;
109  if (k > 0)
110  {
111  ch |= b << k;
112  k -= 6;
113  }
114  else
115  {
116  ch |= b >> (-k);
117  if (ch < 0x80)
118  {
119  if ((0x20 <= ch) && (ch < 0x7f))
120  {
121  /* Printable US-ASCII */
122  goto bail;
123  }
124  *p++ = ch;
125  }
126  else if (ch < 0x800)
127  {
128  *p++ = 0xc0 | (ch >> 6);
129  *p++ = 0x80 | (ch & 0x3f);
130  }
131  else
132  {
133  *p++ = 0xe0 | (ch >> 12);
134  *p++ = 0x80 | ((ch >> 6) & 0x3f);
135  *p++ = 0x80 | (ch & 0x3f);
136  }
137  ch = (b << (16 + k)) & 0xffff;
138  k += 10;
139  }
140  }
141  if (ch || (k < 6))
142  {
143  /* Non-zero or too many extra bits */
144  goto bail;
145  }
146  if (!u7len || (*u7 != '-'))
147  {
148  /* BASE64 not properly terminated */
149  goto bail;
150  }
151  if ((u7len > 2) && (u7[1] == '&') && (u7[2] != '-'))
152  {
153  /* Adjacent BASE64 sections */
154  goto bail;
155  }
156  }
157  else if ((*u7 < 0x20) || (*u7 >= 0x7f))
158  {
159  /* Not printable US-ASCII */
160  goto bail;
161  }
162  else
163  *p++ = *u7;
164  }
165  *p++ = '\0';
166  if (u8len)
167  *u8len = p - buf;
168 
169  mutt_mem_realloc(&buf, p - buf);
170  if (u8)
171  *u8 = buf;
172  return buf;
173 
174 bail:
175  FREE(&buf);
176  return NULL;
177 }
const int Index64u[128]
Lookup table for Base64 encoding/decoding.
Definition: utf7.c:44
void mutt_mem_realloc(void *ptr, size_t size)
Resize a block of memory on the heap.
Definition: memory.c:114
void * mutt_mem_malloc(size_t size)
Allocate memory on the heap.
Definition: memory.c:90
#define FREE(x)
Definition: memory.h:40
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ utf8_to_utf7()

static char* utf8_to_utf7 ( const char *  u8,
size_t  u8len,
char **  u7,
size_t *  u7len 
)
static

Convert data from UTF-8 to RFC2060's UTF-7.

Parameters
[in] u8: UTF-8 data
[in] u8len: Length of UTF-8 data
[out] u7: Save the UTF-7 data pointer
[out] u7len: Save the UTF-7 data length
Return values
ptr: UTF-7 data
NULL: Error

Unicode characters above U+FFFF are replaced by U+FFFE.

Note
The result is null-terminated.
The caller must free() the returned data.

Definition at line 193 of file utf7.c.

194 {
195  int ch;
196  int n, b = 0, k = 0;
197  bool base64 = false;
198 
199  /* In the worst case we convert 2 chars to 7 chars. For example:
200  * "\x10&\x10&..." -> "&ABA-&-&ABA-&-...". */
201  char *buf = mutt_mem_malloc((u8len / 2) * 7 + 6);
202  char *p = buf;
203 
204  while (u8len)
205  {
206  unsigned char c = *u8;
207 
208  if (c < 0x80)
209  {
210  ch = c;
211  n = 0;
212  }
213  else if (c < 0xc2)
214  goto bail;
215  else if (c < 0xe0)
216  {
217  ch = c & 0x1f;
218  n = 1;
219  }
220  else if (c < 0xf0)
221  {
222  ch = c & 0x0f;
223  n = 2;
224  }
225  else if (c < 0xf8)
226  {
227  ch = c & 0x07;
228  n = 3;
229  }
230  else if (c < 0xfc)
231  {
232  ch = c & 0x03;
233  n = 4;
234  }
235  else if (c < 0xfe)
236  {
237  ch = c & 0x01;
238  n = 5;
239  }
240  else
241  goto bail;
242 
243  u8++;
244  u8len--;
245  if (n > u8len)
246  goto bail;
247  for (int i = 0; i < n; i++)
248  {
249  if ((u8[i] & 0xc0) != 0x80)
250  goto bail;
251  ch = (ch << 6) | (u8[i] & 0x3f);
252  }
253  if ((n > 1) && !(ch >> (n * 5 + 1)))
254  goto bail;
255  u8 += n;
256  u8len -= n;
257 
258  if ((ch < 0x20) || (ch >= 0x7f))
259  {
260  if (!base64)
261  {
262  *p++ = '&';
263  base64 = true;
264  b = 0;
265  k = 10;
266  }
267  if (ch & ~0xffff)
268  ch = 0xfffe;
269  *p++ = B64Chars[b | ch >> k];
270  k -= 6;
271  for (; k >= 0; k -= 6)
272  *p++ = B64Chars[(ch >> k) & 0x3f];
273  b = (ch << (-k)) & 0x3f;
274  k += 16;
275  }
276  else
277  {
278  if (base64)
279  {
280  if (k > 10)
281  *p++ = B64Chars[b];
282  *p++ = '-';
283  base64 = false;
284  }
285  *p++ = ch;
286  if (ch == '&')
287  *p++ = '-';
288  }
289  }
290 
291  if (base64)
292  {
293  if (k > 10)
294  *p++ = B64Chars[b];
295  *p++ = '-';
296  }
297 
298  *p++ = '\0';
299  if (u7len)
300  *u7len = p - buf;
301  mutt_mem_realloc(&buf, p - buf);
302  if (u7)
303  *u7 = buf;
304  return buf;
305 
306 bail:
307  FREE(&buf);
308  return NULL;
309 }
static const char B64Chars[64]
Characters of the Base64 encoding.
Definition: utf7.c:59
void mutt_mem_realloc(void *ptr, size_t size)
Resize a block of memory on the heap.
Definition: memory.c:114
void * mutt_mem_malloc(size_t size)
Allocate memory on the heap.
Definition: memory.c:90
int n
Definition: acutest.h:492
#define FREE(x)
Definition: memory.h:40
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ imap_utf_encode()

void imap_utf_encode ( bool  unicode,
char **  s 
)

Encode email from local charset to UTF-8, or to RFC2060's UTF-7 when unicode is false.

Parameters
[in] unicode: true if Unicode is allowed
[in,out] s: Email to convert (replaced by the converted string, or set to NULL on failure)

Definition at line 316 of file utf7.c.

317 {
318  if (!C_Charset || !s || !*s)
319  return;
320 
321  if (unicode && mutt_ch_is_utf8(C_Charset))
322  {
323  return;
324  }
325 
326  if (mutt_ch_convert_string(s, C_Charset, "utf-8", 0) != 0)
327  {
328  FREE(s);
329  return;
330  }
331 
332  if (!unicode)
333  {
334  char *utf7 = utf8_to_utf7(*s, strlen(*s), NULL, 0);
335  FREE(s);
336  *s = utf7;
337  }
338 }
#define mutt_ch_is_utf8(str)
Definition: charset.h:95
int mutt_ch_convert_string(char **ps, const char *from, const char *to, int flags)
Convert a string between encodings.
Definition: charset.c:754
static char * utf8_to_utf7(const char *u8, size_t u8len, char **u7, size_t *u7len)
Convert data from UTF-8 to RFC2060's UTF-7.
Definition: utf7.c:193
#define FREE(x)
Definition: memory.h:40
char * C_Charset
Config: Default character set for displaying text on screen.
Definition: charset.c:53
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ imap_utf_decode()

void imap_utf_decode ( bool  unicode,
char **  s 
)

Decode email from UTF-8, or from RFC2060's UTF-7 when unicode is false, to local charset.

Parameters
[in] unicode: true if Unicode is allowed
[in,out] s: Email to convert (replaced by the converted string, or set to NULL on failure)

Definition at line 345 of file utf7.c.

346 {
347  if (!C_Charset || !s || !*s)
348  return;
349 
350  if (unicode && mutt_ch_is_utf8(C_Charset))
351  {
352  return;
353  }
354 
355  if (!unicode)
356  {
357  char *utf8 = utf7_to_utf8(*s, strlen(*s), 0, 0);
358  FREE(s);
359  *s = utf8;
360  }
361 
362  if (mutt_ch_convert_string(s, "utf-8", C_Charset, 0) != 0)
363  {
364  FREE(s);
365  }
366 }
#define mutt_ch_is_utf8(str)
Definition: charset.h:95
int mutt_ch_convert_string(char **ps, const char *from, const char *to, int flags)
Convert a string between encodings.
Definition: charset.c:754
static char * utf7_to_utf8(const char *u7, size_t u7len, char **u8, size_t *u8len)
Convert data from RFC2060's UTF-7 to UTF-8.
Definition: utf7.c:83
#define FREE(x)
Definition: memory.h:40
char * C_Charset
Config: Default character set for displaying text on screen.
Definition: charset.c:53
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

Variable Documentation

◆ Index64u

const int Index64u[128]
Initial value:
= {
-1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
-1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
-1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,62, 63,-1,-1,-1,
52,53,54,55, 56,57,58,59, 60,61,-1,-1, -1,-1,-1,-1,
-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10, 11,12,13,14,
15,16,17,18, 19,20,21,22, 23,24,25,-1, -1,-1,-1,-1,
-1,26,27,28, 29,30,31,32, 33,34,35,36, 37,38,39,40,
41,42,43,44, 45,46,47,48, 49,50,51,-1, -1,-1,-1,-1
}

Lookup table for Base64 encoding/decoding.

This is very similar to the table in lib/lib_base64.c. Encoding chars: UTF-7 uses A-Za-z0-9 plus '+' and ','; MIME uses A-Za-z0-9 plus '+' and '/'.

Definition at line 44 of file utf7.c.

◆ B64Chars

const char B64Chars[64]
static
Initial value:
= {
'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', ',',
}

Characters of the Base64 encoding.

Definition at line 59 of file utf7.c.