NeoMutt  2020-08-07-1-gab41a1
Teaching an old dog new tricks
DOXYGEN
prex.c
Go to the documentation of this file.
1 
29 #include "config.h"
30 #include <assert.h>
31 #include <stdbool.h>
32 #include <stdint.h>
33 #include "prex.h"
34 #include "logging.h"
35 #include "memory.h"
36 
37 #ifdef HAVE_PCRE2
38 #define PCRE2_CODE_UNIT_WIDTH 8
39 #include <pcre2.h>
40 #include <string.h>
41 
46 static bool pcre2_has_unicode(void)
47 {
48  static uint32_t checked = -1;
49  if (checked == -1)
50  {
51  pcre2_config(PCRE2_CONFIG_UNICODE, &checked);
52  }
53  return checked;
54 }
55 #endif
56 
/* Members of the per-regex storage record, used below as `struct PrexStorage`:
 * one record describes a predefined regex, its compiled form and the buffer
 * that receives its capture offsets. */
68 {
69  enum Prex which; ///< Regex type, e.g. PREX_URL
70  size_t nmatches; ///< Number of regex matches (size of the 'matches' array)
71  const char *str; ///< Regex string
72 #ifdef HAVE_PCRE2
73  pcre2_code *re; ///< Compiled regex (PCRE2 build)
74  pcre2_match_data *mdata; ///< Match data handed to pcre2_match()
75 #else
76  regex_t *re; ///< Compiled regex (POSIX build)
77 #endif
78  regmatch_t *matches; ///< Resulting matches
79 };
80 
/* Reusable regex fragments; each one is a single capturing group that the
 * date-related patterns below splice together by string concatenation. */
/* Three-letter English month abbreviation, capitalised */
81 #define PREX_MONTH "(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)"
/* Three-letter English day-of-week abbreviation, capitalised */
82 #define PREX_DOW "(Mon|Tue|Wed|Thu|Fri|Sat|Sun)"
/* Day-of-week abbreviation accepting any mix of upper and lower case */
83 #define PREX_DOW_NOCASE \
84  "([Mm][Oo][Nn]|[Tt][Uu][Ee]|[Ww][Ee][Dd]|[Tt][Hh][Uu]|[Ff][Rr][Ii]|" \
85  "[Ss][Aa][Tt]|[Ss][Uu][Nn])"
/* Time of day as three two-digit fields: HH:MM:SS */
86 #define PREX_TIME "([[:digit:]]{2}:[[:digit:]]{2}:[[:digit:]]{2})"
/* Four-digit year */
87 #define PREX_YEAR "([[:digit:]]{4})"
88 
/**
 * prex - Compile on demand and get data for a predefined regex
 * @param which Which regex to get, e.g. PREX_URL
 * @retval ptr Storage entry for the requested regex
 *
 * The entries live in a function-local static array, so the returned pointer
 * refers to state shared by all callers.  An entry is compiled the first time
 * it is requested (while its 're' member is still NULL) and reused afterwards.
 */
97 static struct PrexStorage *prex(enum Prex which)
98 {
99  static struct PrexStorage storage[] = {
100  /* clang-format off */
/* URL, e.g. "imaps://user:pass@example.com/INBOX?foo=bar" */
101  {
102  PREX_URL,
104  /* Spec: https://tools.ietf.org/html/rfc3986#section-3 */
105 #ifdef HAVE_PCRE2
106 #define UNR_PCTENC_SUBDEL "][\\p{L}\\p{N}._~%!$&'()*+,;="
107 #else
108 #define UNR_PCTENC_SUBDEL "][[:alnum:]._~%!$&'()*+,;="
109 #endif
110 #define PATH ":@/ "
111  "^([[:alpha:]][-+.[:alnum:]]+):" // . scheme
112  "(" // . rest
113  "(" // . . authority + path
114  // . . or path only
115  "(//" // . . . authority + path
116  "(" // . . . . user info
117  "([" UNR_PCTENC_SUBDEL "@-]*)" // . . . . . user name + '@'
118  "(:([" UNR_PCTENC_SUBDEL "-]*))?" // . . . . . password
119  "@)?"
120  "(" // . . . . host
121  "([" UNR_PCTENC_SUBDEL "-]*)" // . . . . . host name
122  "|"
123  "(\\[[[:xdigit:]:.]+\\])" // . . . . . IPv4 or IPv6
124  ")"
125  "(:([[:digit:]]+))?" // . . . . port
126  "(/([" UNR_PCTENC_SUBDEL PATH "-]*))?" // . . . . path
127  ")"
128  "|"
129  "(" // . . . path only
130  "[" UNR_PCTENC_SUBDEL PATH "-]*" // . . . . path
131  ")"
132  ")"
133  // Should be: "(\\?([" UNR_PCTENC_SUBDEL PATH "?-]*))?"
134  "(\\?([^#]*))?" // . . query
135  ")$"
136 #undef PATH
137 #undef UNR_PCTENC_SUBDEL
138  },
/* One key=value pair of a URL query, e.g. "q=foo" */
139  {
142 #define QUERY_PART "^&=" // Should be: "-[:alnum:]._~%!$'()*+,;:@/"
143  "([" QUERY_PART "]+)=([" QUERY_PART "]+)" // query + ' '
144 #undef QUERY_PART
145  },
/* Encoded word of the form "=?charset?encoding?text?=" */
146  {
149  "=\\?"
150  "([^][()<>@,;:\\\"/?. =]+)" // charset
151  "\\?"
152  "([qQbB])" // encoding
153  "\\?"
154  "([^?]+)" // encoded text - we accept whitespace, see #1189
155  "\\?="
156  },
/* "#H <host> <hash>" line: a host name followed by eight groups of four
 * alphanumeric characters */
157  {
160  "^\\#H ([[:alnum:]_\\.-]+) ([[:alnum:]]{4}( [[:alnum:]]{4}){7})[ \t]*$"
161  },
/* RFC 5322 date, e.g. "Mon, 16 Mar 2020 15:09:35 -0700" */
162  {
165  /* Spec: https://tools.ietf.org/html/rfc5322#section-3.3 */
166  "^"
167  "(" PREX_DOW ", )?" // Day of week
168  " *"
169  "([[:digit:]]{1,2}) " // Day
170  PREX_MONTH // Month
171  " ([[:digit:]]{2,4}) " // Year
172  "([[:digit:]]{2})" // Hour
173  ":([[:digit:]]{2})" // Minute
174  "(:([[:digit:]]{2}))?" // Second
175  " *"
176  "("
177  "([+-][[:digit:]]{4})|" // TZ
178  "([[:alpha:]]+)" // Obsolete TZ
179  ")"
180  },
/* RFC 5322 date that additionally tolerates parenthesised comments and
 * extra spaces (the CFWS fragment) between its fields */
181  {
184  /* Spec: https://tools.ietf.org/html/rfc5322#section-3.3 */
185 #define FWS " *"
186 #define C "(\\(.*\\))?"
187 #define CFWS FWS C FWS
188  "^"
189  CFWS
190  "(([[:alpha:]]+)" CFWS ", *)?" // Day of week (or whatever)
191  CFWS "([[:digit:]]{1,2}) " // Day
192  CFWS PREX_MONTH // Month
193  CFWS "([[:digit:]]{2,4}) " // Year
194  CFWS "([[:digit:]]{1,2})" // Hour
195  ":" CFWS "([[:digit:]]{1,2})" // Minute
196  CFWS
197  "(:" CFWS "([[:digit:]]{1,2}))?" // Second
198  CFWS
199  "("
200  "([+-][[:digit:]]{4})|" // TZ
201  "([[:alpha:]]+)" // Obsolete TZ
202  ")?"
203 #undef CFWS
204 #undef C
205 #undef FWS
206  },
/* Date of the form "DD-Mon-YYYY HH:MM:SS +ZZZZ"; a one-digit day must be
 * preceded by a space */
207  {
210  "( ([[:digit:]])|([[:digit:]]{2}))" // Day
211  "-" PREX_MONTH // Month
212  "-" PREX_YEAR // Year
213  " " PREX_TIME // Time
214  " ([+-][[:digit:]]{4})" // TZ
215  },
/* mbox "From " separator line, strict form */
216  {
219  /* Spec: http://qmail.omnis.ch/man/man5/mbox.html */
220  "^From " // From
221  "([^[:space:]]+) +" // Sender
222  PREX_DOW // Day of week
223  " +"
224  PREX_MONTH // Month
225  " ( ([[:digit:]])|([[:digit:]]{2}))" // Day
226  " +"
227  PREX_TIME // Time
228  " +"
229  PREX_YEAR // Year
230  },
/* mbox "From " separator line, lenient form: optional sender, any-case day
 * of week, optional seconds, skipped timezone text, two- or four-digit year */
231  {
234  /* Spec: http://qmail.omnis.ch/man/man5/mbox.html */
235  "^From " // From
236  "("
237  "[^[:space:]]+" // Sender
238  "( at [^[:space:]]+)?" // Possibly obfuscated, pipermail-style
239  ")?"
240  " *"
241  PREX_DOW_NOCASE // Day of week
242  " +"
243  PREX_MONTH // Month
244  " +"
245  "( " // Day
246  "([[:digit:]])|"
247  "([[:digit:]]{2})"
248  ")"
249  " +"
250  "("
251  PREX_TIME // Time (HH:MM:SS)
252  "|"
253  "([[:digit:]]{2}" // Time (HH:MM)
254  ":[[:digit:]]{2})"
255  ")"
256  " +"
257  "([[:alpha:] ]*)" // Timezone (which we skip)
258  "("
259  PREX_YEAR // Year (YYYY)
260  "|"
261  "([[:digit:]]{2})" // Year (YY)
262  ")"
263  }
264  /* clang-format on */
265  };
266 
/* 'which' is used directly as the array index, and the entry found there must
 * carry the same 'which' - i.e. the array has to stay in enum order */
267  assert((which >= 0) && (which < PREX_MAX) && "Invalid 'which' argument");
268  struct PrexStorage *h = &storage[which];
269  assert((which == h->which) && "Fix 'storage' array");
/* Compile lazily: only an entry whose 're' is still NULL is compiled */
270  if (!h->re)
271  {
272 #ifdef HAVE_PCRE2
/* Ask for UTF mode only when the PCRE2 library reports Unicode support */
273  uint32_t opt = pcre2_has_unicode() ? PCRE2_UTF : 0;
274  int eno;
275  PCRE2_SIZE eoff;
276  h->re = pcre2_compile((PCRE2_SPTR8) h->str, PCRE2_ZERO_TERMINATED, opt,
277  &eno, &eoff, NULL);
278  if (!h->re)
279  {
/* NOTE(review): a string literal is never NULL, so this assertion always
 * passes; a failed compile is not reported and the NULL 're' is used by the
 * calls below. Probably meant assert(h->re && "Fix your RE"). */
280  assert("Fix your RE");
281  }
282  h->mdata = pcre2_match_data_create_from_pattern(h->re, NULL);
/* The pattern's capture groups plus the whole match must equal 'nmatches' */
283  uint32_t ccount;
284  pcre2_pattern_info(h->re, PCRE2_INFO_CAPTURECOUNT, &ccount);
285  assert(ccount + 1 == h->nmatches && "Number of matches do not match (...)");
286  h->matches = mutt_mem_calloc(h->nmatches, sizeof(*h->matches));
287 #else
288  h->re = mutt_mem_calloc(1, sizeof(*h->re));
289  if (regcomp(h->re, h->str, REG_EXTENDED) != 0)
290  {
/* NOTE(review): always-true assertion (non-NULL string literal); a regcomp()
 * failure passes silently here. */
291  assert("Fix your RE");
292  }
293  h->matches = mutt_mem_calloc(h->nmatches, sizeof(*h->matches));
294 #endif
295  }
296  return h;
297 }
298 
306 regmatch_t *mutt_prex_capture(enum Prex which, const char *str)
307 {
308  if (!str)
309  return NULL;
310 
311  struct PrexStorage *h = prex(which);
312 #ifdef HAVE_PCRE2
313  size_t len = strlen(str);
314  int rc = pcre2_match(h->re, (PCRE2_SPTR8) str, len, 0, 0, h->mdata, NULL);
315  if (rc < 0)
316  {
317  PCRE2_UCHAR errmsg[1024];
318  pcre2_get_error_message(rc, errmsg, sizeof(errmsg));
319  mutt_debug(LL_DEBUG2, "pcre2_match - <%s> -> <%s> = %s\n", h->str, str, errmsg);
320  return NULL;
321  }
322  PCRE2_SIZE *ovector = pcre2_get_ovector_pointer(h->mdata);
323  int i = 0;
324  for (; i < rc; i++)
325  {
326  h->matches[i].rm_so = ovector[i * 2];
327  h->matches[i].rm_eo = ovector[i * 2 + 1];
328  }
329  for (; i < h->nmatches; i++)
330  {
331  h->matches[i].rm_so = -1;
332  h->matches[i].rm_eo = -1;
333  }
334 #else
335  if (regexec(h->re, str, h->nmatches, h->matches, 0))
336  return NULL;
337 
338  assert((h->re->re_nsub == (h->nmatches - 1)) &&
339  "Regular expression and matches enum are out of sync");
340 #endif
341  return h->matches;
342 }
343 
347 void mutt_prex_free(void)
348 {
349  for (enum Prex which = 0; which < PREX_MAX; which++)
350  {
351  struct PrexStorage *h = prex(which);
352 #ifdef HAVE_PCRE2
353  pcre2_match_data_free(h->mdata);
354  pcre2_code_free(h->re);
355 #else
356  regfree(h->re);
357  FREE(&h->re);
358 #endif
359  FREE(&h->matches);
360  }
361 }
regex_t * re
Compiled regex.
Definition: prex.c:76
void * mutt_mem_calloc(size_t nmemb, size_t size)
Allocate zeroed memory on the heap.
Definition: memory.c:50
Manage precompiled / predefined regular expressions.
Memory management wrappers.
#define QUERY_PART
Definition: prex.h:42
Prex
Predefined list of regular expressions.
Definition: prex.h:31
[16-MAR-2020 15:09:35 -0700]
Definition: prex.h:39
size_t nmatches
Number of regex matches.
Definition: prex.c:70
regmatch_t * matches
Resulting matches.
Definition: prex.c:78
Logging Dispatcher.
A predefined / precompiled regex.
Definition: prex.c:67
Log at debug level 2.
Definition: logging.h:41
#define UNR_PCTENC_SUBDEL
enum Prex which
Regex type, e.g. PREX_URL.
Definition: prex.c:69
#define PREX_DOW
Definition: prex.c:82
[Mon, (Comment) 16 Mar 2020 15:09:35 -0700]
Definition: prex.h:38
regmatch_t * mutt_prex_capture(enum Prex which, const char *str)
match a precompiled regex against a string
Definition: prex.c:306
https://example.com/?[q=foo]
Definition: prex.h:34
static struct PrexStorage * prex(enum Prex which)
Compile on demand and get data for a predefined regex.
Definition: prex.c:97
#define PREX_TIME
Definition: prex.c:86
[imaps://user:pass@example.com/INBOX?foo=bar]
Definition: prex.h:33
[=?utf-8?Q?=E8=81=AA=E6=98=8E=E7=9A=84?=]
Definition: prex.h:35
[From god@heaven.af.mil Sat Jan 3 01:05:34 1996]
Definition: prex.h:40
#define CFWS
[Mon, 16 Mar 2020 15:09:35 -0700]
Definition: prex.h:37
[#H foo.com A76D 954B EB79 1F49 5B3A 0A0E 0681 65B1]
Definition: prex.h:36
void mutt_prex_free(void)
Cleanup heap memory allocated by compiled regexes.
Definition: prex.c:347
#define FREE(x)
Definition: memory.h:40
#define mutt_debug(LEVEL,...)
Definition: logging.h:81
#define PATH
#define PREX_DOW_NOCASE
Definition: prex.c:83
#define PREX_YEAR
Definition: prex.c:87
#define PREX_MONTH
Definition: prex.c:81
[From god@heaven.af.mil Sat Jan 3 01:05:34 1996]
Definition: prex.h:41
const char * str
Regex string.
Definition: prex.c:71