NeoMutt  2020-11-20
Teaching an old dog new tricks
DOXYGEN
mbyte.c File Reference

Multi-byte String manipulation functions. More...

#include "config.h"
#include <stddef.h>
#include <ctype.h>
#include <limits.h>
#include <stdbool.h>
#include <stdlib.h>
#include <string.h>
#include <wchar.h>
#include <wctype.h>
#include "mbyte.h"
#include "buffer.h"
#include "charset.h"
#include "memory.h"
#include "string2.h"
+ Include dependency graph for mbyte.c:

Go to the source code of this file.

Functions

int mutt_mb_charlen (const char *s, int *width)
 Count the bytes in a (multibyte) character. More...
 
bool mutt_mb_get_initials (const char *name, char *buf, size_t buflen)
 Turn a name into initials. More...
 
int mutt_mb_width (const char *str, int col, bool display)
 Measure a string's display width (in screen columns) More...
 
int mutt_mb_wcwidth (wchar_t wc)
 Measure the screen width of a character. More...
 
int mutt_mb_wcswidth (const wchar_t *s, size_t n)
 Measure the screen width of a string. More...
 
size_t mutt_mb_width_ceiling (const wchar_t *s, size_t n, int w1)
 Keep the end of the string on-screen. More...
 
void mutt_mb_wcstombs (char *dest, size_t dlen, const wchar_t *src, size_t slen)
 Convert a string from wide to multibyte characters. More...
 
size_t mutt_mb_mbstowcs (wchar_t **pwbuf, size_t *pwbuflen, size_t i, char *buf)
 Convert a string from multibyte to wide characters. More...
 
bool mutt_mb_is_shell_char (wchar_t ch)
 Is character not typically part of a pathname. More...
 
bool mutt_mb_is_lower (const char *s)
 Does a multi-byte string contain only lowercase characters? More...
 
bool mutt_mb_is_display_corrupting_utf8 (wchar_t wc)
 Will this character corrupt the display? More...
 
int mutt_mb_filter_unprintable (char **s)
 Replace unprintable characters. More...
 

Variables

bool OptLocales
 (pseudo) set if user has valid locale definition More...
 

Detailed Description

Multi-byte String manipulation functions.

Authors
  • Richard Russon

This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with this program. If not, see http://www.gnu.org/licenses/.

Definition in file mbyte.c.

Function Documentation

◆ mutt_mb_charlen()

int mutt_mb_charlen ( const char *  s,
int *  width 
)

Count the bytes in a (multibyte) character.

Parameters
[in] s - String to be examined
[out] width - Number of screen columns the character would use
Return values
num - Number of bytes the first (multibyte) character of the input consumes
<0 - Conversion error
=0 - End of input
>0 - Length (bytes)

Definition at line 55 of file mbyte.c.

56 {
57  if (!s || (*s == '\0'))
58  return 0;
59 
60  wchar_t wc;
61  mbstate_t mbstate;
62  size_t k, n;
63 
64  n = mutt_str_len(s);
65  memset(&mbstate, 0, sizeof(mbstate));
66  k = mbrtowc(&wc, s, n, &mbstate);
67  if (width)
68  *width = wcwidth(wc);
69  return ((k == (size_t)(-1)) || (k == (size_t)(-2))) ? -1 : k;
70 }
size_t mutt_str_len(const char *a)
Calculate the length of a string, safely.
Definition: string.c:631
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_mb_get_initials()

bool mutt_mb_get_initials ( const char *  name,
char *  buf,
size_t  buflen 
)

Turn a name into initials.

Parameters
name - String to be converted
buf - Buffer for the result
buflen - Size of the buffer
Return values
true - Success
false - Failure

Take a name, e.g. "John F. Kennedy" and reduce it to initials "JFK". The function saves the first character from each word. Words are delimited by whitespace, or hyphens (so "Jean-Pierre" becomes "JP").

Definition at line 84 of file mbyte.c.

85 {
86  if (!name || !buf)
87  return false;
88 
89  while (*name)
90  {
91  /* Char's length in bytes */
92  int clen = mutt_mb_charlen(name, NULL);
93  if (clen < 1)
94  return false;
95 
96  /* Ignore punctuation at the beginning of a word */
97  if ((clen == 1) && ispunct(*name))
98  {
99  name++;
100  continue;
101  }
102 
103  if (clen >= buflen)
104  return false;
105 
106  /* Copy one multibyte character */
107  buflen -= clen;
108  while (clen--)
109  *buf++ = *name++;
110 
111  /* Skip to end-of-word */
112  for (; *name; name += clen)
113  {
114  clen = mutt_mb_charlen(name, NULL);
115  if (clen < 1)
116  return false;
117  if ((clen == 1) && (isspace(*name) || (*name == '-')))
118  break;
119  }
120 
121  /* Skip any whitespace, or hyphens */
122  while (*name && (isspace(*name) || (*name == '-')))
123  name++;
124  }
125 
126  *buf = '\0';
127  return true;
128 }
int mutt_mb_charlen(const char *s, int *width)
Count the bytes in a (multibyte) character.
Definition: mbyte.c:55
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_mb_width()

int mutt_mb_width ( const char *  str,
int  col,
bool  display 
)

Measure a string's display width (in screen columns)

Parameters
str - String to measure
col - Display column (used for expanding tabs)
display - Will this be displayed to the user?
Return values
num - String's width in screen columns

This is like wcswidth(), but takes a const char* rather than a const wchar_t*.

Definition at line 139 of file mbyte.c.

140 {
141  wchar_t wc;
142  int l, w = 0, nl = 0;
143  const char *p = str;
144 
145  while (p && *p)
146  {
147  if (mbtowc(&wc, p, MB_CUR_MAX) >= 0)
148  {
149  l = wcwidth(wc);
150  if (l < 0)
151  l = 1;
152  /* correctly calc tab stop, even for sending as the
153  * line should look pretty on the receiving end */
154  if ((wc == L'\t') || (nl && (wc == L' ')))
155  {
156  nl = 0;
157  l = 8 - (col % 8);
158  }
159  /* track newlines for display-case: if we have a space
160  * after a newline, assume 8 spaces as for display we
161  * always tab-fold */
162  else if (display && (wc == '\n'))
163  nl = 1;
164  }
165  else
166  l = 1;
167  w += l;
168  p++;
169  }
170  return w;
171 }
+ Here is the caller graph for this function:

◆ mutt_mb_wcwidth()

int mutt_mb_wcwidth ( wchar_t  wc)

Measure the screen width of a character.

Parameters
wc - Character to examine
Return values
num - Width in screen columns

Definition at line 178 of file mbyte.c.

179 {
180  int n = wcwidth(wc);
181  if (IsWPrint(wc) && (n > 0))
182  return n;
183  if (!(wc & ~0x7f))
184  return 2;
185  if (!(wc & ~0xffff))
186  return 6;
187  return 10;
188 }
#define IsWPrint(wc)
Definition: mbyte.h:40
+ Here is the caller graph for this function:

◆ mutt_mb_wcswidth()

int mutt_mb_wcswidth ( const wchar_t *  s,
size_t  n 
)

Measure the screen width of a string.

Parameters
s - String to measure
n - Length of string in characters
Return values
num - Width in screen columns

Definition at line 196 of file mbyte.c.

197 {
198  if (!s)
199  return 0;
200 
201  int w = 0;
202  while (n--)
203  w += mutt_mb_wcwidth(*s++);
204  return w;
205 }
int mutt_mb_wcwidth(wchar_t wc)
Measure the screen width of a character.
Definition: mbyte.c:178
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_mb_width_ceiling()

size_t mutt_mb_width_ceiling ( const wchar_t *  s,
size_t  n,
int  w1 
)

Keep the end of the string on-screen.

Parameters
s - String being displayed
n - Length of string in characters
w1 - Width limit
Return values
num - Number of characters to skip

Given a string and a width, determine how many characters from the beginning of the string should be skipped so that the string fits.

Definition at line 217 of file mbyte.c.

218 {
219  if (!s)
220  return 0;
221 
222  const wchar_t *s0 = s;
223  int w = 0;
224  for (; n; s++, n--)
225  if ((w += mutt_mb_wcwidth(*s)) > w1)
226  break;
227  return s - s0;
228 }
int mutt_mb_wcwidth(wchar_t wc)
Measure the screen width of a character.
Definition: mbyte.c:178
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_mb_wcstombs()

void mutt_mb_wcstombs ( char *  dest,
size_t  dlen,
const wchar_t *  src,
size_t  slen 
)

Convert a string from wide to multibyte characters.

Parameters
dest - Buffer for the result
dlen - Length of the result buffer
src - Source string to convert
slen - Length of the source string

Definition at line 237 of file mbyte.c.

238 {
239  if (!dest || !src)
240  return;
241 
242  mbstate_t st;
243  size_t k;
244 
245  /* First convert directly into the destination buffer */
246  memset(&st, 0, sizeof(st));
247  for (; slen && dlen >= MB_LEN_MAX; dest += k, dlen -= k, src++, slen--)
248  {
249  k = wcrtomb(dest, *src, &st);
250  if (k == (size_t)(-1))
251  break;
252  }
253 
254  /* If this works, we can stop now */
255  if (dlen >= MB_LEN_MAX)
256  {
257  dest += wcrtomb(dest, 0, &st);
258  return;
259  }
260 
261  /* Otherwise convert any remaining data into a local buffer */
262  {
263  char buf[3 * MB_LEN_MAX];
264  char *p = buf;
265 
266  for (; slen && p - buf < dlen; p += k, src++, slen--)
267  {
268  k = wcrtomb(p, *src, &st);
269  if (k == (size_t)(-1))
270  break;
271  }
272  p += wcrtomb(p, 0, &st);
273 
274  /* If it fits into the destination buffer, we can stop now */
275  if (p - buf <= dlen)
276  {
277  memcpy(dest, buf, p - buf);
278  return;
279  }
280 
281  /* Otherwise we truncate the string in an ugly fashion */
282  memcpy(dest, buf, dlen);
283  dest[dlen - 1] = '\0'; /* assume original dlen > 0 */
284  }
285 }
+ Here is the caller graph for this function:

◆ mutt_mb_mbstowcs()

size_t mutt_mb_mbstowcs ( wchar_t **  pwbuf,
size_t *  pwbuflen,
size_t  i,
char *  buf 
)

Convert a string from multibyte to wide characters.

Parameters
[out] pwbuf - Buffer for the result
[out] pwbuflen - Length of the result buffer
[in] i - Starting index into the result buffer
[in] buf - String to convert
Return values
num - Index of the first character after the result

Definition at line 295 of file mbyte.c.

296 {
297  if (!pwbuf || !pwbuflen || !buf)
298  return 0;
299 
300  wchar_t wc;
301  mbstate_t st;
302  size_t k;
303  wchar_t *wbuf = *pwbuf;
304  size_t wbuflen = *pwbuflen;
305 
306  while (*buf != '\0')
307  {
308  memset(&st, 0, sizeof(st));
309  for (; (k = mbrtowc(&wc, buf, MB_LEN_MAX, &st)) && k != (size_t)(-1) &&
310  k != (size_t)(-2);
311  buf += k)
312  {
313  if (i >= wbuflen)
314  {
315  wbuflen = i + 20;
316  mutt_mem_realloc(&wbuf, wbuflen * sizeof(*wbuf));
317  }
318  wbuf[i++] = wc;
319  }
320  if ((*buf != '\0') && ((k == (size_t) -1) || (k == (size_t) -2)))
321  {
322  if (i >= wbuflen)
323  {
324  wbuflen = i + 20;
325  mutt_mem_realloc(&wbuf, wbuflen * sizeof(*wbuf));
326  }
327  wbuf[i++] = ReplacementChar;
328  buf++;
329  }
330  }
331  *pwbuf = wbuf;
332  *pwbuflen = wbuflen;
333  return i;
334 }
void mutt_mem_realloc(void *ptr, size_t size)
Resize a block of memory on the heap.
Definition: memory.c:114
wchar_t ReplacementChar
When a Unicode character can't be displayed, use this instead.
Definition: charset.c:58
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_mb_is_shell_char()

bool mutt_mb_is_shell_char ( wchar_t  ch)

Is character not typically part of a pathname.

Parameters
ch - Character to examine
Return values
true - Character is not typically part of a pathname
false - Character is typically part of a pathname
Note
The name is very confusing.

Definition at line 344 of file mbyte.c.

345 {
346  static const wchar_t shell_chars[] = L"<>&()$?*;{}| "; /* ! not included because it can be part of a pathname in NeoMutt */
347  return wcschr(shell_chars, ch);
348 }
+ Here is the caller graph for this function:

◆ mutt_mb_is_lower()

bool mutt_mb_is_lower ( const char *  s)

Does a multi-byte string contain only lowercase characters?

Parameters
s - String to check
Return values
true - String contains no uppercase characters
false - Error, or contains some uppercase characters

Non-alphabetic characters are considered lowercase.

Definition at line 358 of file mbyte.c.

359 {
360  if (!s)
361  return false;
362 
363  wchar_t w;
364  mbstate_t mb;
365  size_t l;
366 
367  memset(&mb, 0, sizeof(mb));
368 
369  for (; (l = mbrtowc(&w, s, MB_CUR_MAX, &mb)) != 0; s += l)
370  {
371  if (l == (size_t) -2)
372  continue; /* shift sequences */
373  if (l == (size_t) -1)
374  return false;
375  if (iswalpha((wint_t) w) && iswupper((wint_t) w))
376  return false;
377  }
378 
379  return true;
380 }
+ Here is the caller graph for this function:

◆ mutt_mb_is_display_corrupting_utf8()

bool mutt_mb_is_display_corrupting_utf8 ( wchar_t  wc)

Will this character corrupt the display?

Parameters
wc - Character to examine
Return values
true - Character would corrupt the display
false - Character is safe to display
Note
This list isn't complete.

Definition at line 390 of file mbyte.c.

391 {
392  if ((wc == (wchar_t) 0x00ad) || /* soft hyphen */
393  (wc == (wchar_t) 0x200e) || /* left-to-right mark */
394  (wc == (wchar_t) 0x200f) || /* right-to-left mark */
395  (wc == (wchar_t) 0xfeff)) /* zero width no-break space */
396  {
397  return true;
398  }
399 
400  /* left-to-right isolate, right-to-left isolate, first strong isolate,
401  * pop directional isolate */
402  if ((wc >= (wchar_t) 0x2066) && (wc <= (wchar_t) 0x2069))
403  return true;
404 
405  /* left-to-right embedding, right-to-left embedding, pop directional formatting,
406  * left-to-right override, right-to-left override */
407  if ((wc >= (wchar_t) 0x202a) && (wc <= (wchar_t) 0x202e))
408  return true;
409 
410  return false;
411 }
+ Here is the caller graph for this function:

◆ mutt_mb_filter_unprintable()

int mutt_mb_filter_unprintable ( char **  s)

Replace unprintable characters.

Parameters
[in,out] s - String to modify
Return values
0 - Success
-1 - Error

Unprintable characters will be replaced with ReplacementChar.

Note
The source string will be freed and a newly allocated string will be returned in its place. The caller should free the returned string.

Definition at line 424 of file mbyte.c.

425 {
426  if (!s || !*s)
427  return -1;
428 
429  wchar_t wc;
430  size_t k, k2;
431  char scratch[MB_LEN_MAX + 1];
432  char *p = *s;
433  mbstate_t mbstate1, mbstate2;
434 
435  struct Buffer buf = mutt_buffer_make(0);
436  memset(&mbstate1, 0, sizeof(mbstate1));
437  memset(&mbstate2, 0, sizeof(mbstate2));
438  for (; (k = mbrtowc(&wc, p, MB_LEN_MAX, &mbstate1)); p += k)
439  {
440  if ((k == (size_t) -1) || (k == (size_t) -2))
441  {
442  k = 1;
443  memset(&mbstate1, 0, sizeof(mbstate1));
444  wc = ReplacementChar;
445  }
446  if (!IsWPrint(wc))
447  wc = '?';
448  else if (CharsetIsUtf8 && mutt_mb_is_display_corrupting_utf8(wc))
449  continue;
450  k2 = wcrtomb(scratch, wc, &mbstate2);
451  scratch[k2] = '\0';
452  mutt_buffer_addstr(&buf, scratch);
453  }
454  FREE(s);
455  *s = buf.data ? buf.data : mutt_mem_calloc(1, 1);
456  return 0;
457 }
void * mutt_mem_calloc(size_t nmemb, size_t size)
Allocate zeroed memory on the heap.
Definition: memory.c:50
bool mutt_mb_is_display_corrupting_utf8(wchar_t wc)
Will this character corrupt the display?
Definition: mbyte.c:390
#define IsWPrint(wc)
Definition: mbyte.h:40
struct Buffer mutt_buffer_make(size_t size)
Make a new buffer on the stack.
Definition: buffer.c:61
String manipulation buffer.
Definition: buffer.h:33
size_t mutt_buffer_addstr(struct Buffer *buf, const char *s)
Add a string to a Buffer.
Definition: buffer.c:225
char * data
Pointer to data.
Definition: buffer.h:35
wchar_t ReplacementChar
When a Unicode character can't be displayed, use this instead.
Definition: charset.c:58
bool CharsetIsUtf8
Is the user's current character set utf-8?
Definition: charset.c:63
#define FREE(x)
Definition: memory.h:40
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

Variable Documentation

◆ OptLocales

bool OptLocales

(pseudo) set if user has valid locale definition

Definition at line 44 of file mbyte.c.