NeoMutt  2024-10-02-58-g7720cc
Teaching an old dog new tricks
DOXYGEN
Loading...
Searching...
No Matches
mbyte.c File Reference

Multi-byte String manipulation functions. More...

#include "config.h"
#include <ctype.h>
#include <limits.h>
#include <stdbool.h>
#include <string.h>
#include <wchar.h>
#include <wctype.h>
#include "mbyte.h"
#include "buffer.h"
#include "charset.h"
#include "memory.h"
#include "pool.h"
#include "string2.h"
+ Include dependency graph for mbyte.c:

Go to the source code of this file.

Functions

int mutt_mb_charlen (const char *s, int *width)
 Count the bytes in a (multibyte) character.
 
bool mutt_mb_get_initials (const char *name, char *buf, size_t buflen)
 Turn a name into initials.
 
int mutt_mb_width (const char *str, int col, bool indent)
 Measure a string's display width (in screen columns)
 
int mutt_mb_wcwidth (wchar_t wc)
 Measure the screen width of a character.
 
int mutt_mb_wcswidth (const wchar_t *s, size_t n)
 Measure the screen width of a string.
 
size_t mutt_mb_width_ceiling (const wchar_t *s, size_t n, int w1)
 Keep the end of the string on-screen.
 
void buf_mb_wcstombs (struct Buffer *dest, const wchar_t *wstr, size_t wlen)
 Convert a string from wide to multibyte characters.
 
size_t mutt_mb_mbstowcs (wchar_t **pwbuf, size_t *pwbuflen, size_t i, const char *buf)
 Convert a string from multibyte to wide characters.
 
bool mutt_mb_is_shell_char (wchar_t ch)
 Is character not typically part of a pathname.
 
bool mutt_mb_is_lower (const char *s)
 Does a multi-byte string contain only lowercase characters?
 
bool mutt_mb_is_display_corrupting_utf8 (wchar_t wc)
 Will this character corrupt the display?
 
int mutt_mb_filter_unprintable (char **s)
 Replace unprintable characters.
 

Variables

bool OptLocales
 (pseudo) set if user has valid locale definition
 

Detailed Description

Multi-byte String manipulation functions.

Authors
  • Richard Russon
  • Pietro Cerutti

This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with this program. If not, see http://www.gnu.org/licenses/.

Definition in file mbyte.c.

Function Documentation

◆ mutt_mb_charlen()

int mutt_mb_charlen ( const char *  s,
int *  width 
)

Count the bytes in a (multibyte) character.

Parameters
[in]sString to be examined
[out]widthNumber of screen columns the character would use
Return values
numBytes consumed by the first (multibyte) character of the input
<0Conversion error
=0End of input
>0Length (bytes)

Definition at line 55 of file mbyte.c.

56{
57 if (!s || (*s == '\0'))
58 return 0;
59
60 wchar_t wc = 0;
61 mbstate_t mbstate = { 0 };
62
63 size_t n = mutt_str_len(s);
64 size_t k = mbrtowc(&wc, s, n, &mbstate);
65 if (width)
66 *width = wcwidth(wc);
67 return ((k == ICONV_ILLEGAL_SEQ) || (k == ICONV_BUF_TOO_SMALL)) ? -1 : k;
68}
#define ICONV_BUF_TOO_SMALL
Error value for iconv() - Buffer too small.
Definition: charset.h:107
#define ICONV_ILLEGAL_SEQ
Error value for iconv() - Illegal sequence.
Definition: charset.h:105
size_t mutt_str_len(const char *a)
Calculate the length of a string, safely.
Definition: string.c:496
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_mb_get_initials()

bool mutt_mb_get_initials ( const char *  name,
char *  buf,
size_t  buflen 
)

Turn a name into initials.

Parameters
nameString to be converted
bufBuffer for the result
buflenSize of the buffer
Return values
trueSuccess
falseFailure

Take a name, e.g. "John F. Kennedy" and reduce it to initials "JFK". The function saves the first character from each word. Words are delimited by whitespace, or hyphens (so "Jean-Pierre" becomes "JP").

Definition at line 82 of file mbyte.c.

83{
84 if (!name || !buf)
85 return false;
86
87 while (*name)
88 {
89 /* Char's length in bytes */
90 int clen = mutt_mb_charlen(name, NULL);
91 if (clen < 1)
92 return false;
93
94 /* Ignore punctuation at the beginning of a word */
95 if ((clen == 1) && ispunct(*name))
96 {
97 name++;
98 continue;
99 }
100
101 if (clen >= buflen)
102 return false;
103
104 /* Copy one multibyte character */
105 buflen -= clen;
106 while (clen--)
107 *buf++ = *name++;
108
109 /* Skip to end-of-word */
110 for (; *name; name += clen)
111 {
112 clen = mutt_mb_charlen(name, NULL);
113 if (clen < 1)
114 return false;
115 if ((clen == 1) && (isspace(*name) || (*name == '-')))
116 break;
117 }
118
119 /* Skip any whitespace, or hyphens */
120 while (*name && (isspace(*name) || (*name == '-')))
121 name++;
122 }
123
124 *buf = '\0';
125 return true;
126}
int mutt_mb_charlen(const char *s, int *width)
Count the bytes in a (multibyte) character.
Definition: mbyte.c:55
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_mb_width()

int mutt_mb_width ( const char *  str,
int  col,
bool  indent 
)

Measure a string's display width (in screen columns)

Parameters
strString to measure
colDisplay column (used for expanding tabs)
indentIf true, newline-space will be indented 8 chars
Return values
numString's width in screen columns

This is like wcswidth(), but takes a multibyte string (const char*) rather than a wide string (const wchar_t*).

Definition at line 137 of file mbyte.c.

138{
139 if (!str || !*str)
140 return 0;
141
142 bool nl = false;
143 int total_width = 0;
144 mbstate_t mbstate = { 0 };
145
146 size_t str_len = mutt_str_len(str);
147
148 while (*str && (str_len > 0))
149 {
150 wchar_t wc = L'\0';
151 size_t consumed = mbrtowc(&wc, str, str_len, &mbstate);
152 if (consumed == 0)
153 break;
154
155 if (consumed == ICONV_ILLEGAL_SEQ)
156 {
157 memset(&mbstate, 0, sizeof(mbstate));
158 wc = ReplacementChar;
159 consumed = 1;
160 }
161 else if (consumed == ICONV_BUF_TOO_SMALL)
162 {
163 wc = ReplacementChar;
164 consumed = str_len;
165 }
166
167 int wchar_width = wcwidth(wc);
168 if (wchar_width < 0)
169 wchar_width = 1;
170
171 if ((wc == L'\t') || (nl && (wc == L' ')))
172 {
173 /* correctly calc tab stop, even for sending as the line should look
174 * pretty on the receiving end */
175 nl = false;
176 wchar_width = 8 - (col % 8);
177 }
178 else if (indent && (wc == '\n'))
179 {
180 /* track newlines for display-case: if we have a space after a newline,
181 * assume 8 spaces as for display we always tab-fold */
182 nl = true;
183 }
184
185 total_width += wchar_width;
186 str += consumed;
187 str_len -= consumed;
188 }
189
190 return total_width;
191}
wchar_t ReplacementChar
When a Unicode character can't be displayed, use this instead.
Definition: charset.c:61
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_mb_wcwidth()

int mutt_mb_wcwidth ( wchar_t  wc)

Measure the screen width of a character.

Parameters
wcCharacter to examine
Return values
numWidth in screen columns

Definition at line 198 of file mbyte.c.

199{
200 int n = wcwidth(wc);
201 if (IsWPrint(wc) && (n > 0))
202 return n;
203 if (!(wc & ~0x7f))
204 return 2;
205 if (!(wc & ~0xffff))
206 return 6;
207 return 10;
208}
#define IsWPrint(wc)
Definition: mbyte.h:41
+ Here is the caller graph for this function:

◆ mutt_mb_wcswidth()

int mutt_mb_wcswidth ( const wchar_t *  s,
size_t  n 
)

Measure the screen width of a string.

Parameters
sString to measure
nLength of string in characters
Return values
numWidth in screen columns

Definition at line 216 of file mbyte.c.

217{
218 if (!s)
219 return 0;
220
221 int w = 0;
222 while (n--)
223 w += mutt_mb_wcwidth(*s++);
224 return w;
225}
int mutt_mb_wcwidth(wchar_t wc)
Measure the screen width of a character.
Definition: mbyte.c:198
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_mb_width_ceiling()

size_t mutt_mb_width_ceiling ( const wchar_t *  s,
size_t  n,
int  w1 
)

Keep the end of the string on-screen.

Parameters
sString being displayed
nLength of string in characters
w1Width limit
Return values
numChars to skip

Given a string and a width, determine how many characters from the beginning of the string should be skipped so that the string fits.

Definition at line 237 of file mbyte.c.

238{
239 if (!s)
240 return 0;
241
242 const wchar_t *s0 = s;
243 int w = 0;
244 for (; n; s++, n--)
245 if ((w += mutt_mb_wcwidth(*s)) > w1)
246 break;
247 return s - s0;
248}
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ buf_mb_wcstombs()

void buf_mb_wcstombs ( struct Buffer dest,
const wchar_t *  wstr,
size_t  wlen 
)

Convert a string from wide to multibyte characters.

Parameters
destBuffer for the result
wstrSource wide string to convert
wlenLength of the wide string

Definition at line 256 of file mbyte.c.

257{
258 if (!dest || !wstr)
259 return;
260
261 // Give ourselves 4 utf-8 bytes per wide character
262 buf_alloc(dest, 4 * wlen);
263
264 mbstate_t mbstate = { 0 };
265 size_t k = 0;
266
267 char *buf = dest->data;
268 size_t buflen = dest->dsize;
269
270 for (; (wlen > 0) && (buflen >= MB_LEN_MAX); buf += k, buflen -= k, wstr++, wlen--)
271 {
272 k = wcrtomb(buf, *wstr, &mbstate);
273 if (k == ICONV_ILLEGAL_SEQ)
274 break;
275 if (*wstr == L'\0')
276 break;
277 }
278
279 *buf = '\0';
280 buf_fix_dptr(dest);
281}
void buf_fix_dptr(struct Buffer *buf)
Move the dptr to end of the Buffer.
Definition: buffer.c:182
void buf_alloc(struct Buffer *buf, size_t new_size)
Make sure a buffer can store at least new_size bytes.
Definition: buffer.c:337
size_t dsize
Length of data.
Definition: buffer.h:39
char * data
Pointer to data.
Definition: buffer.h:37
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_mb_mbstowcs()

size_t mutt_mb_mbstowcs ( wchar_t **  pwbuf,
size_t *  pwbuflen,
size_t  i,
const char *  buf 
)

Convert a string from multibyte to wide characters.

Parameters
[in,out]pwbufBuffer for the result (reallocated if it needs to grow)
[in,out]pwbuflenLength of the result buffer
[in]iStarting index into the result buffer
[in]bufString to convert
Return values
numIndex of the first character after the result

Definition at line 291 of file mbyte.c.

292{
293 if (!pwbuf || !pwbuflen || !buf)
294 return 0;
295
296 wchar_t wc = 0;
297 mbstate_t mbstate = { 0 };
298 size_t k;
299 wchar_t *wbuf = *pwbuf;
300 size_t wbuflen = *pwbuflen;
301
302 while (*buf != '\0')
303 {
304 memset(&mbstate, 0, sizeof(mbstate));
305 for (; (k = mbrtowc(&wc, buf, MB_LEN_MAX, &mbstate)) &&
306 (k != ICONV_ILLEGAL_SEQ) && (k != ICONV_BUF_TOO_SMALL);
307 buf += k)
308 {
309 if (i >= wbuflen)
310 {
311 wbuflen = i + 20;
312 mutt_mem_realloc(&wbuf, wbuflen * sizeof(*wbuf));
313 }
314 wbuf[i++] = wc;
315 }
316 if ((*buf != '\0') && ((k == ICONV_ILLEGAL_SEQ) || (k == ICONV_BUF_TOO_SMALL)))
317 {
318 if (i >= wbuflen)
319 {
320 wbuflen = i + 20;
321 mutt_mem_realloc(&wbuf, wbuflen * sizeof(*wbuf));
322 }
323 wbuf[i++] = ReplacementChar;
324 buf++;
325 }
326 }
327 *pwbuf = wbuf;
328 *pwbuflen = wbuflen;
329 return i;
330}
void mutt_mem_realloc(void *ptr, size_t size)
Resize a block of memory on the heap.
Definition: memory.c:115
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_mb_is_shell_char()

bool mutt_mb_is_shell_char ( wchar_t  ch)

Is character not typically part of a pathname.

Parameters
chCharacter to examine
Return values
trueCharacter is not typically part of a pathname
falseCharacter is typically part of a pathname
Note
The name is very confusing.

Definition at line 340 of file mbyte.c.

341{
342 static const wchar_t shell_chars[] = L"<>&()$?*;{}| "; /* ! not included because it can be part of a pathname in NeoMutt */
343 return wcschr(shell_chars, ch);
344}
+ Here is the caller graph for this function:

◆ mutt_mb_is_lower()

bool mutt_mb_is_lower ( const char *  s)

Does a multi-byte string contain only lowercase characters?

Parameters
sString to check
Return values
trueString contains no uppercase characters
falseError, or contains some uppercase characters

Non-alphabetic characters are considered lowercase.

Definition at line 354 of file mbyte.c.

355{
356 if (!s)
357 return false;
358
359 wchar_t wc = 0;
360 mbstate_t mbstate = { 0 };
361 size_t l;
362
363 memset(&mbstate, 0, sizeof(mbstate));
364 size_t n = mutt_str_len(s);
365
366 for (; (n > 0) && (*s != '\0') && (l = mbrtowc(&wc, s, n, &mbstate)) != 0; s += l, n -= l)
367 {
368 if ((l == ICONV_BUF_TOO_SMALL) || (l == ICONV_ILLEGAL_SEQ))
369 return false; // error; assume upper-case
370 if (iswalpha((wint_t) wc) && iswupper((wint_t) wc))
371 return false; // upper-case
372 }
373
374 return true; // lower-case
375}
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_mb_is_display_corrupting_utf8()

bool mutt_mb_is_display_corrupting_utf8 ( wchar_t  wc)

Will this character corrupt the display?

Parameters
wcCharacter to examine
Return values
trueCharacter would corrupt the display
falseCharacter is safe to display
Note
This list isn't complete.

Definition at line 385 of file mbyte.c.

386{
387 if ((wc == (wchar_t) 0x00ad) || /* soft hyphen */
388 (wc == (wchar_t) 0x200e) || /* left-to-right mark */
389 (wc == (wchar_t) 0x200f) || /* right-to-left mark */
390 (wc == (wchar_t) 0xfeff)) /* zero width no-break space */
391 {
392 return true;
393 }
394
395 /* left-to-right isolate, right-to-left isolate, first strong isolate,
396 * pop directional isolate */
397 if ((wc >= (wchar_t) 0x2066) && (wc <= (wchar_t) 0x2069))
398 return true;
399
400 /* left-to-right embedding, right-to-left embedding, pop directional formatting,
401 * left-to-right override, right-to-left override */
402 if ((wc >= (wchar_t) 0x202a) && (wc <= (wchar_t) 0x202e))
403 return true;
404
405 /* arabic letter mark */
406 if (wc == (wchar_t) 0x061c)
407 return true;
408
409 return false;
410}
+ Here is the caller graph for this function:

◆ mutt_mb_filter_unprintable()

int mutt_mb_filter_unprintable ( char **  s)

Replace unprintable characters.

Parameters
[in,out]sString to modify
Return values
0Success
-1Error

Invalid multibyte sequences will be replaced with ReplacementChar; other unprintable characters will be replaced with '?'.

Note
The source string will be freed and a newly allocated string will be returned in its place. The caller should free the returned string.

Definition at line 423 of file mbyte.c.

424{
425 if (!s || !*s)
426 return -1;
427
428 wchar_t wc = 0;
429 size_t k, k2;
430 char scratch[MB_LEN_MAX + 1];
431 char *p = *s;
432 mbstate_t mbstate1 = { 0 };
433 mbstate_t mbstate2 = { 0 };
434
435 struct Buffer *buf = buf_pool_get();
436 for (; (k = mbrtowc(&wc, p, MB_LEN_MAX, &mbstate1)); p += k)
437 {
438 if ((k == ICONV_ILLEGAL_SEQ) || (k == ICONV_BUF_TOO_SMALL))
439 {
440 k = 1;
441 memset(&mbstate1, 0, sizeof(mbstate1));
442 wc = ReplacementChar;
443 }
444 if (!IsWPrint(wc))
445 wc = '?';
446 else if (CharsetIsUtf8 && mutt_mb_is_display_corrupting_utf8(wc))
447 continue;
448 k2 = wcrtomb(scratch, wc, &mbstate2);
449 scratch[k2] = '\0';
450 buf_addstr(buf, scratch);
451 }
452 FREE(s);
453
454 if (buf_is_empty(buf))
455 *s = mutt_mem_calloc(1, 1); // Fake empty string
456 else
457 *s = buf_strdup(buf);
458
459 buf_pool_release(&buf);
460 return 0;
461}
bool buf_is_empty(const struct Buffer *buf)
Is the Buffer empty?
Definition: buffer.c:291
size_t buf_addstr(struct Buffer *buf, const char *s)
Add a string to a Buffer.
Definition: buffer.c:226
char * buf_strdup(const struct Buffer *buf)
Copy a Buffer's string.
Definition: buffer.c:571
bool mutt_mb_is_display_corrupting_utf8(wchar_t wc)
Will this character corrupt the display?
Definition: mbyte.c:385
void * mutt_mem_calloc(size_t nmemb, size_t size)
Allocate zeroed memory on the heap.
Definition: memory.c:51
#define FREE(x)
Definition: memory.h:45
bool CharsetIsUtf8
Is the user's current character set utf-8?
Definition: charset.c:66
struct Buffer * buf_pool_get(void)
Get a Buffer from the pool.
Definition: pool.c:81
void buf_pool_release(struct Buffer **ptr)
Return a Buffer to the pool.
Definition: pool.c:94
String manipulation buffer.
Definition: buffer.h:36
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

Variable Documentation

◆ OptLocales

bool OptLocales

(pseudo) set if user has valid locale definition

Definition at line 44 of file mbyte.c.