#include "regex.h"
#include "stdlib.h"
#include "string.h"
#include "ctype.h"
/*
* Minimal POSIX regex — recursive backtracking matcher.
* Supports BRE and ERE: . * + ? | ^ $ [] [^] () \1-\9
* POSIX character classes: [:alpha:] [:digit:] [:alnum:] [:space:] etc.
*/
#define MAX_SUB 10
#define CLASS_BYTES 32
/* Internal compiled instruction */
enum {
OP_LIT, OP_ANY, OP_CLASS, OP_NCLASS,
OP_BOL, OP_EOL,
OP_STAR, OP_PLUS, OP_QUEST, /* greedy quantifiers: operand follows */
OP_SAVE_OPEN, OP_SAVE_CLOSE, /* sub-match boundary */
OP_JMP, OP_SPLIT, /* for alternation */
OP_MATCH,
};
typedef struct {
unsigned char type;
unsigned char c; /* literal char */
unsigned char cclass[CLASS_BYTES];
short arg; /* jmp offset or save slot */
} inst_t;
typedef struct {
inst_t* code;
int len;
int cap;
int nsub;
int cflags;
} prog_t;
/* ---- helpers ---- */
static void cc_set(unsigned char* cc, int ch) {
cc[(unsigned)ch / 8] |= (unsigned char)(1 << ((unsigned)ch % 8));
}
static int cc_test(const unsigned char* cc, int ch) {
return (cc[(unsigned)ch / 8] >> ((unsigned)ch % 8)) & 1;
}
static inst_t* emit(prog_t* p) {
if (p->len >= p->cap) {
int nc = p->cap ? p->cap * 2 : 64;
inst_t* tmp = (inst_t*)realloc(p->code, (size_t)nc * sizeof(inst_t));
if (!tmp) return (inst_t*)0;
p->code = tmp;
p->cap = nc;
}
inst_t* ip = &p->code[p->len];
memset(ip, 0, sizeof(*ip));
p->len++;
return ip;
}
/* Parse bracket expression, advance *pp past ']'. */
static void parse_bracket(const char** pp, unsigned char* cc, int* negate, int icase) {
const char* p = *pp;
*negate = 0;
memset(cc, 0, CLASS_BYTES);
if (*p == '^') { *negate = 1; p++; }
if (*p == ']') { cc_set(cc, ']'); p++; }
while (*p && *p != ']') {
if (*p == '[' && p[1] == ':') {
p += 2;
const char* end = p;
while (*end && !(*end == ':' && end[1] == ']')) end++;
size_t len = (size_t)(end - p);
for (int ch = 1; ch < 256; ch++) {
int m = 0;
if (len==5 && !memcmp(p,"alpha",5)) m = isalpha(ch);
else if (len==5 && !memcmp(p,"digit",5)) m = isdigit(ch);
else if (len==5 && !memcmp(p,"alnum",5)) m = isalnum(ch);
else if (len==5 && !memcmp(p,"space",5)) m = isspace(ch);
else if (len==5 && !memcmp(p,"upper",5)) m = isupper(ch);
else if (len==5 && !memcmp(p,"lower",5)) m = islower(ch);
else if (len==5 && !memcmp(p,"print",5)) m = isprint(ch);
else if (len==5 && !memcmp(p,"graph",5)) m = isgraph(ch);
else if (len==5 && !memcmp(p,"cntrl",5)) m = iscntrl(ch);
else if (len==5 && !memcmp(p,"punct",5)) m = ispunct(ch);
else if (len==6 && !memcmp(p,"xdigit",6)) m = isxdigit(ch);
else if (len==5 && !memcmp(p,"blank",5)) m = (ch==' '||ch=='\t');
if (m) cc_set(cc, ch);
}
if (*end) p = end + 2; else p = end;
continue;
}
int c1 = (unsigned char)*p++;
if (*p == '-' && p[1] && p[1] != ']') {
p++;
int c2 = (unsigned char)*p++;
for (int c = c1; c <= c2; c++) cc_set(cc, c);
} else {
cc_set(cc, c1);
}
}
if (*p == ']') p++;
if (icase) {
for (int ch = 0; ch < 256; ch++) {
if (cc_test(cc, ch)) {
if (ch >= 'a' && ch <= 'z') cc_set(cc, ch - 32);
if (ch >= 'A' && ch <= 'Z') cc_set(cc, ch + 32);
}
}
}
*pp = p;
}
/* ---- Compiler: pattern string -> inst_t[] ---- */
/* Forward declarations */
static int compile_alt(prog_t* P, const char** pp, int ere, int icase);
static int compile_concat(prog_t* P, const char** pp, int ere, int icase);
static int compile_quant(prog_t* P, const char** pp, int ere, int icase);
static int compile_atom(prog_t* P, const char** pp, int ere, int icase);
static int compile_atom(prog_t* P, const char** pp, int ere, int icase) {
const char* p = *pp;
if (!*p) return -1;
/* Group: ( or \( */
if ((ere && *p == '(') || (!ere && *p == '\\' && p[1] == '(')) {
p += ere ? 1 : 2;
int sub = ++P->nsub;
inst_t* ip = emit(P); if (!ip) return REG_ESPACE;
ip->type = OP_SAVE_OPEN; ip->arg = (short)(sub * 2);
*pp = p;
int r = compile_alt(P, pp, ere, icase);
if (r) return r;
p = *pp;
if ((ere && *p == ')') || (!ere && *p == '\\' && p[1] == ')')) {
p += ere ? 1 : 2;
}
ip = emit(P); if (!ip) return REG_ESPACE;
ip->type = OP_SAVE_CLOSE; ip->arg = (short)(sub * 2 + 1);
*pp = p;
return 0;
}
/* ^ */
if (*p == '^') {
inst_t* ip = emit(P); if (!ip) return REG_ESPACE;
ip->type = OP_BOL;
*pp = p + 1;
return 0;
}
/* $ */
if (*p == '$') {
inst_t* ip = emit(P); if (!ip) return REG_ESPACE;
ip->type = OP_EOL;
*pp = p + 1;
return 0;
}
/* . */
if (*p == '.') {
inst_t* ip = emit(P); if (!ip) return REG_ESPACE;
ip->type = OP_ANY;
*pp = p + 1;
return 0;
}
/* [bracket] */
if (*p == '[') {
p++;
inst_t* ip = emit(P); if (!ip) return REG_ESPACE;
int neg = 0;
parse_bracket(&p, ip->cclass, &neg, icase);
ip->type = neg ? OP_NCLASS : OP_CLASS;
*pp = p;
return 0;
}
/* Escape */
if (*p == '\\' && p[1]) {
p++;
/* Check for end-of-group */
if (!ere && (*p == ')' || *p == '|')) return -1;
inst_t* ip = emit(P); if (!ip) return REG_ESPACE;
ip->type = OP_LIT;
ip->c = (unsigned char)*p;
*pp = p + 1;
return 0;
}
/* ERE special: ) | are not atoms */
if (ere && (*p == ')' || *p == '|')) return -1;
/* BRE: \) \| are not atoms */
if (!ere && *p == '\\' && (p[1] == ')' || p[1] == '|')) return -1;
/* Literal */
{
inst_t* ip = emit(P); if (!ip) return REG_ESPACE;
ip->type = OP_LIT;
ip->c = (unsigned char)*p;
if (icase && *p >= 'A' && *p <= 'Z') ip->c = (unsigned char)(*p + 32);
if (icase && *p >= 'a' && *p <= 'z') ip->c = (unsigned char)*p;
*pp = p + 1;
return 0;
}
}
static int compile_quant(prog_t* P, const char** pp, int ere, int icase) {
int atom_start = P->len;
int r = compile_atom(P, pp, ere, icase);
if (r) return r;
int atom_end = P->len;
const char* p = *pp;
int qtype = -1;
if (*p == '*') { qtype = OP_STAR; p++; }
else if (ere && *p == '+') { qtype = OP_PLUS; p++; }
else if (ere && *p == '?') { qtype = OP_QUEST; p++; }
if (qtype >= 0) {
/* Wrap: insert a quantifier instruction referencing the atom range */
/* Implementation: we store quantifier type and atom length so the
* matcher can loop over the atom instructions. */
inst_t* ip = emit(P); if (!ip) return REG_ESPACE;
ip->type = (unsigned char)qtype;
ip->arg = (short)(atom_end - atom_start); /* length of atom code */
*pp = p;
}
return 0;
}
static int compile_concat(prog_t* P, const char** pp, int ere, int icase) {
const char* p = *pp;
if (!*p) return 0;
while (*p) {
/* Check for alternation or group close */
if (ere && (*p == '|' || *p == ')')) break;
if (!ere && *p == '\\' && (p[1] == '|' || p[1] == ')')) break;
*pp = p;
int r = compile_quant(P, pp, ere, icase);
if (r == -1) break; /* not an atom */
if (r > 0) return r; /* error */
p = *pp;
}
*pp = p;
return 0;
}
static int compile_alt(prog_t* P, const char** pp, int ere, int icase) {
/* Save position for SPLIT patching */
int start = P->len;
int r = compile_concat(P, pp, ere, icase);
if (r > 0) return r;
const char* p = *pp;
while ((ere && *p == '|') || (!ere && *p == '\\' && p[1] == '|')) {
p += ere ? 1 : 2;
/* Insert SPLIT at start, JMP at end of first branch */
/* For simplicity, emit JMP at end of current branch */
inst_t* jmp = emit(P); if (!jmp) return REG_ESPACE;
jmp->type = OP_JMP;
/* jmp->arg will be patched to point past alt */
int jmp_idx = P->len - 1;
/* Insert SPLIT at start */
/* Actually, alternation in a linear bytecode is complex.
* Let's use a simpler approach: mark alternation boundaries
* and handle in the matcher with backtracking. */
/* For now, store SPLIT with offset to second branch */
inst_t* split = emit(P); if (!split) return REG_ESPACE;
split->type = OP_SPLIT;
split->arg = (short)(jmp_idx - start); /* first branch length */
*pp = p;
r = compile_concat(P, pp, ere, icase);
if (r > 0) return r;
p = *pp;
/* Patch JMP to skip past this alternative */
P->code[jmp_idx].arg = (short)(P->len - jmp_idx - 1);
}
*pp = p;
(void)start;
return 0;
}
/* ---- Backtracking matcher ---- */
typedef struct {
const inst_t* code;
int codelen;
const char* str;
const char* str_start;
int eflags;
int cflags;
const char* sub[MAX_SUB * 2]; /* submatch start/end pointers */
} mstate_t;
static int match_inst(mstate_t* M, int pc, const char* sp);
static int match_char(const inst_t* ip, int ch, int icase) {
if (ip->type == OP_ANY) return (ch != '\n' && ch != 0);
if (ip->type == OP_LIT) {
int c = ip->c;
if (icase) {
int lch = (ch >= 'A' && ch <= 'Z') ? ch + 32 : ch;
int lc = (c >= 'A' && c <= 'Z') ? c + 32 : c;
return lch == lc;
}
return ch == c;
}
if (ip->type == OP_CLASS) return cc_test(ip->cclass, ch);
if (ip->type == OP_NCLASS) return !cc_test(ip->cclass, ch) && ch != '\n' && ch != 0;
return 0;
}
static int match_inst(mstate_t* M, int pc, const char* sp) {
while (pc < M->codelen) {
const inst_t* ip = &M->code[pc];
switch (ip->type) {
case OP_LIT:
case OP_ANY:
case OP_CLASS:
case OP_NCLASS:
if (*sp == '\0') return 0;
if (!match_char(ip, (unsigned char)*sp, M->cflags & REG_ICASE)) return 0;
sp++;
pc++;
break;
case OP_BOL:
if (sp != M->str_start && sp[-1] != '\n') {
if (M->eflags & REG_NOTBOL) return 0;
if (sp != M->str_start) return 0;
}
pc++;
break;
case OP_EOL:
if (*sp != '\0' && *sp != '\n') {
if (M->eflags & REG_NOTEOL) return 0;
if (*sp != '\0') return 0;
}
pc++;
break;
case OP_SAVE_OPEN:
case OP_SAVE_CLOSE: {
int slot = ip->arg;
if (slot >= 0 && slot < MAX_SUB * 2) {
const char* old = M->sub[slot];
M->sub[slot] = sp;
if (match_inst(M, pc + 1, sp)) return 1;
M->sub[slot] = old;
return 0;
}
pc++;
break;
}
case OP_STAR: {
/* Greedy: try matching atom as many times as possible, then backtrack */
int alen = ip->arg; /* atom is at pc - alen to pc - 1 */
int atom_pc = pc - alen;
pc++; /* skip past STAR instruction */
/* Count max matches */
const char* positions[256];
int count = 0;
positions[0] = sp;
while (count < 255) {
const char* save_sp = sp;
/* Try to match atom */
mstate_t tmp = *M;
if (!match_inst(&tmp, atom_pc, sp)) break;
/* Atom matched: advance sp by how much the atom consumed */
/* We need to figure out where sp ended up — tricky with backtracking */
/* Simpler approach: try one char at a time for simple atoms */
/* Actually, let's just try greedy: match rest at each possible count */
/* For single-char atoms (common case), each match is 1 char */
if (alen == 1 && (M->code[atom_pc].type == OP_LIT ||
M->code[atom_pc].type == OP_ANY ||
M->code[atom_pc].type == OP_CLASS ||
M->code[atom_pc].type == OP_NCLASS)) {
if (!match_char(&M->code[atom_pc], (unsigned char)*sp, M->cflags & REG_ICASE)) break;
sp++;
} else {
/* Multi-instruction atom — skip by trying char-by-char */
if (*sp == '\0') break;
sp++; /* approximate: advance by 1 */
}
count++;
positions[count] = sp;
(void)save_sp;
}
/* Greedy: try from max count down to 0 */
for (int i = count; i >= 0; i--) {
if (match_inst(M, pc, positions[i])) return 1;
}
return 0;
}
case OP_PLUS: {
int alen = ip->arg;
int atom_pc = pc - alen;
pc++;
const char* positions[256];
int count = 0;
const char* cur_sp = sp;
while (count < 255) {
if (alen == 1 && (M->code[atom_pc].type == OP_LIT ||
M->code[atom_pc].type == OP_ANY ||
M->code[atom_pc].type == OP_CLASS ||
M->code[atom_pc].type == OP_NCLASS)) {
if (*cur_sp == '\0') break;
if (!match_char(&M->code[atom_pc], (unsigned char)*cur_sp, M->cflags & REG_ICASE)) break;
cur_sp++;
} else {
if (*cur_sp == '\0') break;
cur_sp++;
}
count++;
positions[count] = cur_sp;
}
if (count == 0) return 0; /* + requires at least 1 */
for (int i = count; i >= 1; i--) {
if (match_inst(M, pc, positions[i])) return 1;
}
return 0;
}
case OP_QUEST: {
int alen = ip->arg;
int atom_pc = pc - alen;
pc++;
/* Greedy: try with atom, then without */
const char* after_atom = sp;
if (alen == 1 && (M->code[atom_pc].type == OP_LIT ||
M->code[atom_pc].type == OP_ANY ||
M->code[atom_pc].type == OP_CLASS ||
M->code[atom_pc].type == OP_NCLASS)) {
if (*sp && match_char(&M->code[atom_pc], (unsigned char)*sp, M->cflags & REG_ICASE))
after_atom = sp + 1;
else
after_atom = sp; /* atom didn't match */
} else {
if (*sp) after_atom = sp + 1;
}
if (after_atom != sp && match_inst(M, pc, after_atom)) return 1;
return match_inst(M, pc, sp);
}
case OP_JMP:
pc += ip->arg + 1;
break;
case OP_SPLIT: {
/* Try first branch (instructions before JMP), then second */
/* This is handled implicitly by the alternation compilation */
pc++;
break;
}
case OP_MATCH:
return 1;
default:
return 0;
}
}
/* Reached end of code = match */
M->sub[1] = sp; /* end of match */
return 1;
}
/* ---- Public API ---- */
int regcomp(regex_t* preg, const char* pattern, int cflags) {
if (!preg || !pattern) return REG_BADPAT;
prog_t* P = (prog_t*)malloc(sizeof(prog_t));
if (!P) return REG_ESPACE;
memset(P, 0, sizeof(*P));
P->cflags = cflags;
int ere = (cflags & REG_EXTENDED) ? 1 : 0;
int icase = (cflags & REG_ICASE) ? 1 : 0;
const char* pp = pattern;
int r = compile_alt(P, &pp, ere, icase);
if (r > 0) { free(P->code); free(P); return r; }
/* Emit MATCH at end */
inst_t* ip = emit(P);
if (!ip) { free(P->code); free(P); return REG_ESPACE; }
ip->type = OP_MATCH;
preg->re_nsub = (size_t)P->nsub;
preg->_priv = P;
return 0;
}
int regexec(const regex_t* preg, const char* string,
size_t nmatch, regmatch_t pmatch[], int eflags) {
if (!preg || !preg->_priv || !string) return REG_NOMATCH;
prog_t* P = (prog_t*)preg->_priv;
mstate_t M;
memset(&M, 0, sizeof(M));
M.code = P->code;
M.codelen = P->len;
M.str = string;
M.str_start = string;
M.eflags = eflags;
M.cflags = P->cflags;
/* Try matching at each position in the string */
for (const char* sp = string; ; sp++) {
memset(M.sub, 0, sizeof(M.sub));
M.sub[0] = sp;
if (match_inst(&M, 0, sp)) {
/* Fill in pmatch */
if (pmatch && nmatch > 0 && !(P->cflags & REG_NOSUB)) {
pmatch[0].rm_so = (regoff_t)(M.sub[0] - string);
pmatch[0].rm_eo = (regoff_t)(M.sub[1] - string);
for (size_t i = 1; i < nmatch; i++) {
if (M.sub[i * 2] && M.sub[i * 2 + 1]) {
pmatch[i].rm_so = (regoff_t)(M.sub[i * 2] - string);
pmatch[i].rm_eo = (regoff_t)(M.sub[i * 2 + 1] - string);
} else {
pmatch[i].rm_so = -1;
pmatch[i].rm_eo = -1;
}
}
}
return 0;
}
if (*sp == '\0') break;
}
return REG_NOMATCH;
}
void regfree(regex_t* preg) {
if (!preg || !preg->_priv) return;
prog_t* P = (prog_t*)preg->_priv;
free(P->code);
free(P);
preg->_priv = (void*)0;
}
size_t regerror(int errcode, const regex_t* preg,
char* errbuf, size_t errbuf_size) {
(void)preg;
static const char* msgs[] = {
"Success",
"No match",
"Invalid regex",
"Invalid collating element",
"Invalid character class",
"Trailing backslash",
"Invalid back reference",
"Unmatched [",
"Unmatched (",
"Unmatched {",
"Invalid range",
"Invalid range end",
"Out of memory",
"Invalid repetition",
};
const char* msg = "Unknown error";
if (errcode >= 0 && errcode <= 13) msg = msgs[errcode];
size_t len = strlen(msg) + 1;
if (errbuf && errbuf_size > 0) {
size_t n = len < errbuf_size ? len : errbuf_size;
memcpy(errbuf, msg, n - 1);
errbuf[n - 1] = '\0';
}
return len;
}