NeoMutt  2023-05-17-33-gce4425
Teaching an old dog new tricks
DOXYGEN
prex.c
Go to the documentation of this file.
1
29#include "config.h"
30#include <assert.h>
31#include <stdbool.h>
32#include <stdint.h>
33#include "prex.h"
34#include "logging2.h"
35#include "memory.h"
36
37#ifdef HAVE_PCRE2
38#define PCRE2_CODE_UNIT_WIDTH 8
39#include <pcre2.h>
40#include <string.h>
41
46static bool pcre2_has_unicode(void)
47{
48 static uint32_t checked = -1;
49 if (checked == -1)
50 {
51 pcre2_config(PCRE2_CONFIG_UNICODE, &checked);
52 }
53 return checked;
54}
55#endif
56
/* Body of struct PrexStorage: one predefined regex together with its lazily
 * compiled form and the buffer that receives its capture offsets.
 * NOTE(review): the `struct PrexStorage` tag line is not visible in this
 * listing directly above the opening brace - confirm against the real file. */
68{
69 enum Prex which; ///< Regex type, e.g. PREX_URL; prex() asserts it equals the entry's array index
70 size_t nmatches; ///< Number of entries in 'matches'; prex() asserts it equals capture-group count + 1
71 const char *str; ///< Regex source text handed to pcre2_compile() / regcomp()
72#ifdef HAVE_PCRE2
73 pcre2_code *re; ///< Compiled regex (PCRE2 build); NULL until prex() first compiles it
74 pcre2_match_data *mdata; ///< PCRE2 match data created from 're', passed to pcre2_match()
75#else
76 regex_t *re; ///< Compiled regex (POSIX build); NULL until prex() first compiles it
77#endif
78 regmatch_t *matches; ///< Array of 'nmatches' offsets, filled in and returned by mutt_prex_capture()
79};
80
// Regex fragments shared by several of the patterns defined in prex().
// Every fragment is wrapped in parentheses, so each use adds one capture group.

// Three-letter English month abbreviation (case-sensitive as written)
81#define PREX_MONTH "(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)"
// Three-letter English day-of-week abbreviation (case-sensitive as written)
82#define PREX_DOW "(Mon|Tue|Wed|Thu|Fri|Sat|Sun)"
// Same day names as PREX_DOW, with every letter spelled as an upper/lower-case pair
83#define PREX_DOW_NOCASE \
84 "([Mm][Oo][Nn]|[Tt][Uu][Ee]|[Ww][Ee][Dd]|[Tt][Hh][Uu]|[Ff][Rr][Ii]|[Ss][Aa][Tt]|[Ss][Uu][Nn])"
// HH:MM:SS with exactly two digits per field
85#define PREX_TIME "([[:digit:]]{2}:[[:digit:]]{2}:[[:digit:]]{2})"
// Exactly four digits
86#define PREX_YEAR "([[:digit:]]{4})"
87
/**
 * prex - Compile on demand and get data for a predefined regex
 * @param which Which regex to get; must be less than PREX_MAX
 * @retval ptr Entry of the function-local static table for 'which'
 *
 * The first request for a given regex compiles it and allocates its
 * 'matches' array; later requests return the same, already-compiled entry.
 * Invalid arguments, a pattern that fails to compile, or (PCRE2 build) a
 * capture-group count that disagrees with 'nmatches' trip an assert().
 */
96static struct PrexStorage *prex(enum Prex which)
97{
// One entry per regex.  Entries must be in 'enum Prex' order, because the
// table is indexed by 'which' and the stored 'which' is asserted to match.
// NOTE(review): in this listing most entries show only the pattern string;
// the 'which' / 'nmatches' initialisers (presumably the PREX_* and
// PREX_*_MATCH_MAX constants) are not visible - confirm against the real file.
98 static struct PrexStorage storage[] = {
99 // clang-format off
100 {
101 PREX_URL,
103 /* Spec: https://tools.ietf.org/html/rfc3986#section-3 */
// Character-class body shared by the URL parts below; the PCRE2 build uses
// Unicode letter/number properties where the POSIX build uses [:alnum:].
104#ifdef HAVE_PCRE2
105#define UNR_PCTENC_SUBDEL "][\\p{L}\\p{N}._~%!$&'()*+,;="
106#else
107#define UNR_PCTENC_SUBDEL "][[:alnum:]._~%!$&'()*+,;="
108#endif
109#define PATH ":@/ "
110 "^([[:alpha:]][-+.[:alnum:]]+):" // . scheme
111 "(" // . rest
112 "(" // . . authority + path
113 // . . or path only
114 "(//" // . . . authority + path
115 "(" // . . . . user info
116 "([" UNR_PCTENC_SUBDEL "@-]*)" // . . . . . user name + '@'
117 "(:([" UNR_PCTENC_SUBDEL "-]*))?" // . . . . . password
118 "@)?"
119 "(" // . . . . host
120 "([" UNR_PCTENC_SUBDEL "-]*)" // . . . . . host name
121 "|"
122 "(\\[[[:xdigit:]:.]+\\])" // . . . . . IPv4 or IPv6
123 ")"
124 "(:([[:digit:]]+))?" // . . . . port
125 "(/([" UNR_PCTENC_SUBDEL PATH "-]*))?" // . . . . path
126 ")"
127 "|"
128 "(" // . . . path only
129 "[" UNR_PCTENC_SUBDEL PATH "-]*" // . . . . path
130 ")"
131 ")"
132 // Should be: "(\\?([" UNR_PCTENC_SUBDEL PATH "?-]*))?"
133 "(\\?([^#]*))?" // . . query
134 ")$"
135#undef PATH
136#undef UNR_PCTENC_SUBDEL
137 },
138 {
141#define QUERY_PART "^&=" // Should be: "-[:alnum:]._~%!$'()*+,;:@/"
142 "([" QUERY_PART "]+)=([" QUERY_PART "]+)" // query + ' '
143#undef QUERY_PART
144 },
145 {
148 "=\\?"
149 "([^][()<>@,;:\\\"/?. =]+)" // charset
150 "\\?"
151 "([qQbB])" // encoding
152 "\\?"
153 "([^?]+)" // encoded text - we accept whitespace, see #1189
154 "\\?="
155 },
156 {
159 "^\\#H ([[:alnum:]_\\.-]+) ([[:alnum:]]{4}( [[:alnum:]]{4}){7})[ \t]*$"
160 },
161 {
164 /* Spec: https://tools.ietf.org/html/rfc5322#section-3.3 */
165#define FWS " *"
166#define C "(\\(.*\\))?"
167#define CFWS FWS C FWS
168 "^"
169 CFWS
170 "(([[:alpha:]]+)" CFWS ", *)?" // Day of week (or whatever)
171 CFWS "([[:digit:]]{1,2}) " // Day
172 CFWS PREX_MONTH // Month
173 CFWS "([[:digit:]]{2,4}) " // Year
174 CFWS "([[:digit:]]{1,2})" // Hour
175 ":" CFWS "([[:digit:]]{1,2})" // Minute
176 CFWS
177 "(:" CFWS "([[:digit:]]{1,2}))?" // Second
178 CFWS
179 "("
180 "([+-][[:digit:]]{4})|" // TZ
181 "([[:alpha:]]+)" // Obsolete TZ
182 ")?"
183#undef CFWS
184#undef C
185#undef FWS
186 },
187 {
190 "( ([[:digit:]])|([[:digit:]]{2}))" // Day
191 "-" PREX_MONTH // Month
192 "-" PREX_YEAR // Year
193 " " PREX_TIME // Time
194 " ([+-][[:digit:]]{4})" // TZ
195 },
196 {
199 /* Spec: http://qmail.omnis.ch/man/man5/mbox.html */
200 "^From " // From
201 "([^[:space:]]+) +" // Sender
202 PREX_DOW // Day of week
203 " +"
204 PREX_MONTH // Month
205 " ( ([[:digit:]])|([[:digit:]]{2}))" // Day
206 " +"
207 PREX_TIME // Time
208 " +"
209 PREX_YEAR // Year
210 },
211 {
214 /* Spec: http://qmail.omnis.ch/man/man5/mbox.html */
215 "^From " // From
216 "("
217 "[^[:space:]]+" // Sender
218 "( at [^[:space:]]+)?" // Possibly obfuscated, pipermail-style
219 ")?"
220 " *"
221 PREX_DOW_NOCASE // Day of week
222 " +"
223 PREX_MONTH // Month
224 " +"
225 "( " // Day
226 "([[:digit:]])|"
227 "([[:digit:]]{2})"
228 ")"
229 " +"
230 "("
231 PREX_TIME // Time (HH:MM:SS)
232 "|"
233 "([[:digit:]]{2}" // Time (HH:MM)
234 ":[[:digit:]]{2})"
235 ")"
236 " +"
237 "("
238 "([[:alpha:] ]+)|" // Timezone name (which we skip)
239 "([+][[:digit:]]{4} )" // Timezone offset (which we skip)
240 ")?"
241 "("
242 PREX_YEAR // Year (YYYY)
243 "|"
244 "([[:digit:]]{2})" // Year (YY)
245 ")"
246 },
247 {
250 "^([[:alpha:]]+): (.*)$"
251 },
252 // clang-format on
253 };
254
// Reject out-of-range indices, then check the table order against the enum
255 assert((which < PREX_MAX) && "Invalid 'which' argument");
256 struct PrexStorage *h = &storage[which];
257 assert((which == h->which) && "Fix 'storage' array");
// A NULL 're' means this regex has not been compiled yet
258 if (!h->re)
259 {
260#ifdef HAVE_PCRE2
// Only request UTF mode when the PCRE2 library reports Unicode support
261 uint32_t opt = pcre2_has_unicode() ? PCRE2_UTF : 0;
262 int eno = 0;
263 PCRE2_SIZE eoff = 0;
264 h->re = pcre2_compile((PCRE2_SPTR8) h->str, PCRE2_ZERO_TERMINATED, opt,
265 &eno, &eoff, NULL);
266 assert(h->re && "Fix your RE");
267 h->mdata = pcre2_match_data_create_from_pattern(h->re, NULL);
// Capture groups plus one slot for the whole match must equal 'nmatches',
// which is also the size of the 'matches' array allocated below
268 uint32_t ccount = 0;
269 pcre2_pattern_info(h->re, PCRE2_INFO_CAPTURECOUNT, &ccount);
270 assert(((ccount + 1) == h->nmatches) && "Number of matches do not match (...)");
271 h->matches = mutt_mem_calloc(h->nmatches, sizeof(*h->matches));
272#else
// POSIX build: the regex_t itself lives on the heap and is compiled as an
// extended regular expression
273 h->re = mutt_mem_calloc(1, sizeof(*h->re));
274 const int rc = regcomp(h->re, h->str, REG_EXTENDED);
275 assert(rc == 0 && "Fix your RE");
276 h->matches = mutt_mem_calloc(h->nmatches, sizeof(*h->matches));
277#endif
278 }
279 return h;
280}
281
289regmatch_t *mutt_prex_capture(enum Prex which, const char *str)
290{
291 if (!str)
292 return NULL;
293
294 struct PrexStorage *h = prex(which);
295#ifdef HAVE_PCRE2
296 size_t len = strlen(str);
297 int rc = pcre2_match(h->re, (PCRE2_SPTR8) str, len, 0, 0, h->mdata, NULL);
298 if (rc < 0)
299 {
300 PCRE2_UCHAR errmsg[1024];
301 pcre2_get_error_message(rc, errmsg, sizeof(errmsg));
302 mutt_debug(LL_DEBUG2, "pcre2_match - <%s> -> <%s> = %s\n", h->str, str, errmsg);
303 return NULL;
304 }
305 PCRE2_SIZE *ovector = pcre2_get_ovector_pointer(h->mdata);
306 int i = 0;
307 for (; i < rc; i++)
308 {
309 h->matches[i].rm_so = ovector[i * 2];
310 h->matches[i].rm_eo = ovector[i * 2 + 1];
311 }
312 for (; i < h->nmatches; i++)
313 {
314 h->matches[i].rm_so = -1;
315 h->matches[i].rm_eo = -1;
316 }
317#else
318 if (regexec(h->re, str, h->nmatches, h->matches, 0))
319 return NULL;
320
321 assert((h->re->re_nsub == (h->nmatches - 1)) &&
322 "Regular expression and matches enum are out of sync");
323#endif
324 return h->matches;
325}
326
331{
332 for (enum Prex which = 0; which < PREX_MAX; which++)
333 {
334 struct PrexStorage *h = prex(which);
335#ifdef HAVE_PCRE2
336 pcre2_match_data_free(h->mdata);
337 pcre2_code_free(h->re);
338#else
339 regfree(h->re);
340 FREE(&h->re);
341#endif
342 FREE(&h->matches);
343 }
344}
#define mutt_debug(LEVEL,...)
Definition: logging2.h:84
Logging Dispatcher.
@ LL_DEBUG2
Log at debug level 2.
Definition: logging2.h:41
void * mutt_mem_calloc(size_t nmemb, size_t size)
Allocate zeroed memory on the heap.
Definition: memory.c:50
Memory management wrappers.
#define FREE(x)
Definition: memory.h:43
#define PREX_DOW_NOCASE
Definition: prex.c:83
regmatch_t * mutt_prex_capture(enum Prex which, const char *str)
Match a precompiled regex against a string.
Definition: prex.c:289
#define CFWS
#define PREX_DOW
Definition: prex.c:82
#define PREX_TIME
Definition: prex.c:85
#define QUERY_PART
#define PREX_MONTH
Definition: prex.c:81
static struct PrexStorage * prex(enum Prex which)
Compile on demand and get data for a predefined regex.
Definition: prex.c:96
void mutt_prex_free(void)
Cleanup heap memory allocated by compiled regexes.
Definition: prex.c:330
#define PATH
#define PREX_YEAR
Definition: prex.c:86
#define UNR_PCTENC_SUBDEL
Manage precompiled / predefined regular expressions.
@ PREX_MBOX_FROM_LAX_MATCH_MAX
Definition: prex.h:214
@ PREX_ACCOUNT_CMD_MATCH_MAX
Definition: prex.h:225
@ PREX_IMAP_DATE_MATCH_MAX
Definition: prex.h:168
@ PREX_MBOX_FROM_MATCH_MAX
Definition: prex.h:187
@ PREX_RFC2047_ENCODED_WORD_MATCH_MAX
Definition: prex.h:98
@ PREX_URL_QUERY_KEY_VAL_MATCH_MAX
Definition: prex.h:84
Prex
Predefined list of regular expressions.
Definition: prex.h:32
@ PREX_GNUTLS_CERT_HOST_HASH
[#H foo.com A76D 954B EB79 1F49 5B3A 0A0E 0681 65B1]
Definition: prex.h:36
@ PREX_MBOX_FROM_LAX
[From god@heaven.af.mil Sat Jan 3 01:05:34 1996]
Definition: prex.h:40
@ PREX_URL
[imaps://user:pass@example.com/INBOX?foo=bar]
Definition: prex.h:33
@ PREX_MBOX_FROM
[From god@heaven.af.mil Sat Jan 3 01:05:34 1996]
Definition: prex.h:39
@ PREX_ACCOUNT_CMD
key: value
Definition: prex.h:41
@ PREX_IMAP_DATE
[16-MAR-2020 15:09:35 -0700]
Definition: prex.h:38
@ PREX_RFC5322_DATE_LAX
[Mon, (Comment) 16 Mar 2020 15:09:35 -0700]
Definition: prex.h:37
@ PREX_URL_QUERY_KEY_VAL
https://example.com/?[q=foo]
Definition: prex.h:34
@ PREX_MAX
Definition: prex.h:42
@ PREX_RFC2047_ENCODED_WORD
[=?utf-8?Q?=E8=81=AA=E6=98=8E=E7=9A=84?=]
Definition: prex.h:35
@ PREX_RFC5322_DATE_LAX_MATCH_MAX
Definition: prex.h:150
@ PREX_URL_MATCH_MAX
Definition: prex.h:71
@ PREX_GNUTLS_CERT_HOST_HASH_MATCH_MAX
Definition: prex.h:112
A predefined / precompiled regex.
Definition: prex.c:68
const char * str
Regex string.
Definition: prex.c:71
enum Prex which
Regex type, e.g. PREX_URL.
Definition: prex.c:69
size_t nmatches
Number of regex matches.
Definition: prex.c:70
regex_t * re
Compiled regex.
Definition: prex.c:76
regmatch_t * matches
Resulting matches.
Definition: prex.c:78