Multi-byte String manipulation functions. More...

#include "config.h"
#include <ctype.h>
#include <limits.h>
#include <stdbool.h>
#include <string.h>
#include <wchar.h>
#include <wctype.h>
#include "mbyte.h"
#include "buffer.h"
#include "charset.h"
#include "memory.h"
#include "pool.h"
#include "string2.h"

Include dependency graph for mbyte.c:

Go to the source code of this file.

Functions
int	mutt_mb_charlen (const char *s, int *width)
	Count the bytes in a (multibyte) character.

bool	mutt_mb_get_initials (const char *name, char *buf, size_t buflen)
	Turn a name into initials.

int	mutt_mb_width (const char *str, int col, bool indent)
	Measure a string's display width (in screen columns)

int	mutt_mb_wcwidth (wchar_t wc)
	Measure the screen width of a character.

int	mutt_mb_wcswidth (const wchar_t *s, size_t n)
	Measure the screen width of a string.

size_t	mutt_mb_width_ceiling (const wchar_t *s, size_t n, int w1)
	Keep the end of the string on-screen.

void	buf_mb_wcstombs (struct Buffer *dest, const wchar_t *wstr, size_t wlen)
	Convert a string from wide to multibyte characters.

size_t	mutt_mb_mbstowcs (wchar_t **pwbuf, size_t *pwbuflen, size_t i, const char *buf)
	Convert a string from multibyte to wide characters.

bool	mutt_mb_is_shell_char (wchar_t ch)
	Is character not typically part of a pathname.

bool	mutt_mb_is_lower (const char *s)
	Does a multi-byte string contain only lowercase characters?

bool	mutt_mb_is_display_corrupting_utf8 (wchar_t wc)
	Will this character corrupt the display?

int	mutt_mb_filter_unprintable (char **s)
	Replace unprintable characters.

Variables
bool	OptLocales
	(pseudo) set if user has valid locale definition

Detailed Description

Multi-byte String manipulation functions.

Authors

Richard Russon
Pietro Cerutti

Copyright: This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 2 of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with this program. If not, see http://www.gnu.org/licenses/.

Definition in file mbyte.c.

Function Documentation

◆ mutt_mb_charlen()

int mutt_mb_charlen	(	const char *	s,
		int *	width
	)

Count the bytes in a (multibyte) character.

Parameters

[in]	s	String to be examined
[out]	width	Number of screen columns the character would use

Return values

num	Number of bytes consumed by the first (multibyte) character of input
<0	Conversion error
=0	End of input
>0	Length (bytes)

Definition at line 55 of file mbyte.c.

{
  /* NULL or empty input: there is no character to measure */
  if (!s || (s[0] == '\0'))
    return 0;

  mbstate_t state = { 0 };
  wchar_t wc = 0;

  /* Decode exactly one character, looking at no more than the whole string */
  const size_t len = mbrtowc(&wc, s, mutt_str_len(s), &state);

  /* The width is written whether or not the conversion succeeded */
  if (width)
    *width = wcwidth(wc);

  if ((len == ICONV_ILLEGAL_SEQ) || (len == ICONV_BUF_TOO_SMALL))
    return -1;

  return len;
}

Here is the call graph for this function:

Here is the caller graph for this function:

◆ mutt_mb_get_initials()

bool mutt_mb_get_initials	(	const char *	name,
		char *	buf,
		size_t	buflen
	)

Turn a name into initials.

Parameters

name	String to be converted
buf	Buffer for the result
buflen	Size of the buffer

Return values

true	Success
false	Failure

Take a name, e.g. "John F. Kennedy" and reduce it to initials "JFK". The function saves the first character from each word. Words are delimited by whitespace, or hyphens (so "Jean-Pierre" becomes "JP").

Definition at line 82 of file mbyte.c.

{
  /* A zero-length buffer cannot even hold the terminating NUL written below */
  if (!name || !buf || (buflen == 0))
    return false;

  while (*name)
  {
    /* Char's length in bytes */
    int clen = mutt_mb_charlen(name, NULL);
    if (clen < 1)
      return false;

    /* Ignore punctuation at the beginning of a word.
     * <ctype.h> functions require a value representable as unsigned char,
     * so cast to avoid undefined behaviour on negative char values. */
    if ((clen == 1) && ispunct((unsigned char) *name))
    {
      name++;
      continue;
    }

    /* Always keep one byte spare for the terminating NUL */
    if ((size_t) clen >= buflen)
      return false;

    /* Copy one multibyte character */
    buflen -= clen;
    while (clen--)
      *buf++ = *name++;

    /* Skip to end-of-word (clen is reassigned before the loop increment uses it) */
    for (; *name; name += clen)
    {
      clen = mutt_mb_charlen(name, NULL);
      if (clen < 1)
        return false;
      if ((clen == 1) && (isspace((unsigned char) *name) || (*name == '-')))
        break;
    }

    /* Skip any whitespace, or hyphens */
    while (*name && (isspace((unsigned char) *name) || (*name == '-')))
      name++;
  }

  *buf = '\0';
  return true;
}

Here is the call graph for this function:

Here is the caller graph for this function:

◆ mutt_mb_width()

int mutt_mb_width	(	const char *	str,
		int	col,
		bool	indent
	)

Measure a string's display width (in screen columns)

Parameters

str	String to measure
col	Display column (used for expanding tabs)
indent	If true, newline-space will be indented 8 chars

Return values

num	String's width in screen columns

This is like wcswidth(), but takes a const char* rather than a const wchar_t*.

Definition at line 137 of file mbyte.c.

{
  if (!str || (*str == '\0'))
    return 0;

  mbstate_t state = { 0 };
  size_t remaining = mutt_str_len(str);
  bool after_newline = false;
  int columns = 0;

  while ((*str != '\0') && (remaining > 0))
  {
    wchar_t wc = L'\0';
    size_t used = mbrtowc(&wc, str, remaining, &state);
    if (used == 0)
      break;

    if (used == ICONV_ILLEGAL_SEQ)
    {
      /* Restart the decoder and step over a single byte, counting it as
       * one replacement character */
      memset(&state, 0, sizeof(state));
      wc = ReplacementChar;
      used = 1;
    }
    else if (used == ICONV_BUF_TOO_SMALL)
    {
      /* Everything that is left counts as one replacement character */
      wc = ReplacementChar;
      used = remaining;
    }

    /* Characters without a known width are counted as one column */
    int cell_width = wcwidth(wc);
    if (cell_width < 0)
      cell_width = 1;

    const bool is_tab = (wc == L'\t');
    const bool is_folded_space = after_newline && (wc == L' ');

    if (is_tab || is_folded_space)
    {
      /* Distance to the next tab stop, measured from the caller's column, so
       * that the line also looks right on the receiving end */
      after_newline = false;
      cell_width = 8 - (col % 8);
    }
    else if (indent && (wc == '\n'))
    {
      /* Remember the newline: a space following it is treated like a tab,
       * because folded lines are always displayed tab-indented */
      after_newline = true;
    }

    columns += cell_width;
    str += used;
    remaining -= used;
  }

  return columns;
}

Here is the call graph for this function:

Here is the caller graph for this function:

◆ mutt_mb_wcwidth()

int mutt_mb_wcwidth ( wchar_t wc )

Measure the screen width of a character.

Parameters

wc	Character to examine

Return values

num	Width in screen columns

Definition at line 198 of file mbyte.c.

{
  const int cols = wcwidth(wc);

  /* Printable character with a positive width: use the real width */
  if (IsWPrint(wc) && (cols > 0))
    return cols;

  /* Otherwise pick a fixed width from the size of the code point.
   * NOTE(review): presumably the widths of the escaped forms used when
   * displaying such characters -- confirm against the display code. */
  if ((wc & ~0x7f) == 0)
    return 2;

  if ((wc & ~0xffff) == 0)
    return 6;

  return 10;
}

Here is the caller graph for this function:

◆ mutt_mb_wcswidth()

int mutt_mb_wcswidth	(	const wchar_t *	s,
		size_t	n
	)

Measure the screen width of a string.

Parameters

s	String to measure
n	Length of string in characters

Return values

num	Width in screen columns

Definition at line 216 of file mbyte.c.

{
  if (!s)
    return 0;

  /* Sum the display width of each of the first n wide characters */
  int total = 0;
  for (size_t idx = 0; idx < n; idx++)
    total += mutt_mb_wcwidth(s[idx]);

  return total;
}

Here is the call graph for this function:

Here is the caller graph for this function:

◆ mutt_mb_width_ceiling()

size_t mutt_mb_width_ceiling	(	const wchar_t *	s,
		size_t	n,
		int	w1
	)

Keep the end of the string on-screen.

Parameters

s	String being displayed
n	Length of string in characters
w1	Width limit

Return values

num	Chars to skip

Given a string and a width, determine how many characters from the beginning of the string should be skipped so that the string fits.

Definition at line 237 of file mbyte.c.

{
  if (!s)
    return 0;

  size_t count = 0; /* characters whose cumulative width stays within w1 */
  int used = 0;     /* cumulative width, including the character under test */

  while (count < n)
  {
    used += mutt_mb_wcwidth(s[count]);
    if (used > w1)
      break; /* this character would exceed the limit; do not count it */
    count++;
  }

  return count;
}

Here is the call graph for this function:

Here is the caller graph for this function:

◆ buf_mb_wcstombs()

void buf_mb_wcstombs	(	struct Buffer *	dest,
		const wchar_t *	wstr,
		size_t	wlen
	)

Convert a string from wide to multibyte characters.

Parameters

dest	Buffer for the result
wstr	Source wide string to convert
wlen	Length of the wide string

Definition at line 256 of file mbyte.c.

{
  if (!dest || !wstr)
    return;

  // Reserve 4 utf-8 bytes for every wide character
  buf_alloc(dest, 4 * wlen);

  mbstate_t state = { 0 };
  char *out = dest->data;
  size_t room = dest->dsize;

  /* Convert while input remains and a worst-case character still fits */
  while ((wlen > 0) && (room >= MB_LEN_MAX))
  {
    const size_t written = wcrtomb(out, *wstr, &state);
    if (written == ICONV_ILLEGAL_SEQ)
      break;

    /* Stop at an embedded terminator; 'out' is left pointing at it */
    if (*wstr == L'\0')
      break;

    out += written;
    room -= written;
    wstr++;
    wlen--;
  }

  *out = '\0';
  buf_fix_dptr(dest);
}

Here is the call graph for this function:

Here is the caller graph for this function:

◆ mutt_mb_mbstowcs()

size_t mutt_mb_mbstowcs	(	wchar_t **	pwbuf,
		size_t *	pwbuflen,
		size_t	i,
		const char *	buf
	)

Convert a string from multibyte to wide characters.

Parameters

[out]	pwbuf	Buffer for the result
[out]	pwbuflen	Length of the result buffer
[in]	i	Starting index into the result buffer
[in]	buf	String to convert

Return values

num	Index of the first element after the converted result in the buffer

Definition at line 291 of file mbyte.c.

{
  if (!pwbuf || !pwbuflen || !buf)
    return 0;

  wchar_t *out = *pwbuf;
  size_t capacity = *pwbuflen;
  wchar_t wc = 0;
  mbstate_t state = { 0 };

  while (*buf != '\0')
  {
    /* Every run of convertible characters starts from a clean state */
    memset(&state, 0, sizeof(state));

    size_t len;
    while (true)
    {
      len = mbrtowc(&wc, buf, MB_LEN_MAX, &state);
      if ((len == 0) || (len == ICONV_ILLEGAL_SEQ) || (len == ICONV_BUF_TOO_SMALL))
        break;

      /* Grow the output in steps of 20 elements beyond the write index */
      if (i >= capacity)
      {
        capacity = i + 20;
        MUTT_MEM_REALLOC(&out, capacity, wchar_t);
      }
      out[i++] = wc;
      buf += len;
    }

    /* The run ended on a conversion failure rather than at the terminator:
     * emit a replacement character and step over one input byte */
    const bool failed = (len == ICONV_ILLEGAL_SEQ) || (len == ICONV_BUF_TOO_SMALL);
    if (failed && (*buf != '\0'))
    {
      if (i >= capacity)
      {
        capacity = i + 20;
        MUTT_MEM_REALLOC(&out, capacity, wchar_t);
      }
      out[i++] = ReplacementChar;
      buf++;
    }
  }

  /* Hand the (possibly reallocated) buffer and its size back to the caller */
  *pwbuf = out;
  *pwbuflen = capacity;
  return i;
}

Here is the caller graph for this function:

◆ mutt_mb_is_shell_char()

bool mutt_mb_is_shell_char ( wchar_t ch )

Is character not typically part of a pathname.

Parameters

ch	Character to examine

Return values

true	Character is not typically part of a pathname
false	Character is typically part of a pathname

Note: The name is very confusing.

Definition at line 340 of file mbyte.c.

{
  /* '!' is deliberately left out because it can be part of a pathname in NeoMutt */
  static const wchar_t shell_chars[] = L"<>&()$?*;{}| ";

  /* wcschr() treats the terminating null as part of the string, so a null
   * wide character is also reported as a match */
  return wcschr(shell_chars, ch) != NULL;
}

Here is the caller graph for this function:

◆ mutt_mb_is_lower()

bool mutt_mb_is_lower ( const char * s )

Does a multi-byte string contain only lowercase characters?

Parameters

s	String to check

Return values

true	String contains no uppercase characters
false	Error, or contains some uppercase characters

Non-alphabetic characters are considered lowercase.

Definition at line 354 of file mbyte.c.

{
  if (!s)
    return false;

  mbstate_t state;
  memset(&state, 0, sizeof(state));

  wchar_t wc = 0;
  size_t remaining = mutt_str_len(s);

  while ((remaining > 0) && (*s != '\0'))
  {
    const size_t len = mbrtowc(&wc, s, remaining, &state);
    if (len == 0)
      break;

    if ((len == ICONV_BUF_TOO_SMALL) || (len == ICONV_ILLEGAL_SEQ))
      return false; // conversion error; treated as upper-case

    if (iswalpha((wint_t) wc) && iswupper((wint_t) wc))
      return false; // found an upper-case letter

    s += len;
    remaining -= len;
  }

  return true; // no upper-case letters found
}

Here is the call graph for this function:

Here is the caller graph for this function:

◆ mutt_mb_is_display_corrupting_utf8()

bool mutt_mb_is_display_corrupting_utf8 ( wchar_t wc )

Will this character corrupt the display?

Parameters

wc	Character to examine

Return values

true	Character would corrupt the display
false	Character is safe to display

Note: This list isn't complete.

Definition at line 385 of file mbyte.c.

{
  /* Individual code points that are rejected */
  static const wchar_t singles[] = {
    (wchar_t) 0x00ad, /* soft hyphen */
    (wchar_t) 0x061c, /* arabic letter mark */
    (wchar_t) 0x200e, /* left-to-right mark */
    (wchar_t) 0x200f, /* right-to-left mark */
    (wchar_t) 0xfeff, /* zero width no-break space */
  };

  for (size_t idx = 0; idx < (sizeof(singles) / sizeof(singles[0])); idx++)
  {
    if (wc == singles[idx])
      return true;
  }

  /* left-to-right embedding, right-to-left embedding, pop directional formatting,
   * left-to-right override, right-to-left override */
  const bool is_embedding = (wc >= (wchar_t) 0x202a) && (wc <= (wchar_t) 0x202e);

  /* left-to-right isolate, right-to-left isolate, first strong isolate,
   * pop directional isolate */
  const bool is_isolate = (wc >= (wchar_t) 0x2066) && (wc <= (wchar_t) 0x2069);

  return is_embedding || is_isolate;
}

Here is the caller graph for this function:

◆ mutt_mb_filter_unprintable()

int mutt_mb_filter_unprintable ( char ** s )

Replace unprintable characters.

Parameters

[in,out] s String to modify

Return values

0	Success
-1	Error

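Invalid multibyte sequences are replaced with ReplacementChar; unprintable characters are replaced with '?'. In UTF-8, byte-order marks and display-corrupting characters are dropped.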

Note: The source string will be freed and a newly allocated string will be returned in its place. The caller should free the returned string.

Definition at line 423 of file mbyte.c.

{
  if (!s || !*s)
    return -1;

  wchar_t wc = 0;
  size_t k, k2;
  char scratch[MB_LEN_MAX + 1];
  char *p = *s;
  mbstate_t mbstate1 = { 0 }; /* decoding state (input) */
  mbstate_t mbstate2 = { 0 }; /* encoding state (output) */

  struct Buffer *buf = buf_pool_get();
  for (; (k = mbrtowc(&wc, p, MB_LEN_MAX, &mbstate1)); p += k)
  {
    if ((k == ICONV_ILLEGAL_SEQ) || (k == ICONV_BUF_TOO_SMALL))
    {
      /* Undecodable input: restart the decoder, step over a single byte and
       * substitute the replacement character */
      k = 1;
      memset(&mbstate1, 0, sizeof(mbstate1));
      wc = ReplacementChar;
    }
    if (CharsetIsUtf8 && IsBOM(wc))
    {
      /* Drop byte-order marks entirely */
      continue;
    }
    if (!IsWPrint(wc))
      wc = '?';
    else if (CharsetIsUtf8 && mutt_mb_is_display_corrupting_utf8(wc))
      continue;

    k2 = wcrtomb(scratch, wc, &mbstate2);
    if (k2 == ICONV_ILLEGAL_SEQ)
    {
      /* wc cannot be encoded (same failure check as buf_mb_wcstombs()).
       * Using the failure value as an index would write far outside
       * scratch[], so emit a plain '?' and restart the encoder instead. */
      memset(&mbstate2, 0, sizeof(mbstate2));
      scratch[0] = '?';
      k2 = 1;
    }
    scratch[k2] = '\0';
    buf_addstr(buf, scratch);
  }

  /* The input string is released and replaced by the filtered copy */
  FREE(s);

  if (buf_is_empty(buf))
    *s = MUTT_MEM_CALLOC(1, char); // Fake empty string
  else
    *s = buf_strdup(buf);

  buf_pool_release(&buf);
  return 0;
}

Here is the call graph for this function:

Here is the caller graph for this function:

Variable Documentation

◆ OptLocales

bool OptLocales

(pseudo) set if user has valid locale definition

Definition at line 44 of file mbyte.c.

Functions

Variables

Detailed Description

Function Documentation

◆ mutt_mb_charlen()

◆ mutt_mb_get_initials()

◆ mutt_mb_width()

◆ mutt_mb_wcwidth()

◆ mutt_mb_wcswidth()

◆ mutt_mb_width_ceiling()

◆ buf_mb_wcstombs()

◆ mutt_mb_mbstowcs()

◆ mutt_mb_is_shell_char()

◆ mutt_mb_is_lower()

◆ mutt_mb_is_display_corrupting_utf8()

◆ mutt_mb_filter_unprintable()

Variable Documentation

◆ OptLocales