/*
 * lexicgen - translate a lexer description into a Java class.
 *
 * Input layout, as consumed by compile():
 *
 *   declarations        "%import X", "%eof NAME", "%error NAME",
 *                       or variable definitions "Name = syntax;"
 *   %%
 *   token definitions   "Name: syntax;"
 *   %%                  (optional)
 *   trailer             copied into the class body, one tab deeper
 *
 * Blank lines and lines starting with "//" are ignored in the first two
 * sections.
 *
 * Syntax operators, from tightest to loosest binding:
 *   x*  x+  x?          postfix repetition / optional
 *   x y                 juxtaposition  -> .andThen()
 *   x & y               -> .butOnly()
 *   x \ y               -> .butNot()
 *   x | y               -> .orElse()
 * Atoms are "(syntax)", "[set]" / "[^set]", "\"string\"", an alphabetic
 * identifier, or nothing at all (epsilon) in front of ';', ')', '|', '&'.
 */
#ifdef __OpenBSD__
# define _BSD_SOURCE
#elif defined(__GNUC__)
# define _DEFAULT_SOURCE
# define _GNU_SOURCE
#endif

#include <ctype.h>
#include <err.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

/* ------------------------------------------------------------------------
 * Allocation helpers: the tool cannot do anything useful without memory,
 * so every wrapper terminates the process on failure.
 * ------------------------------------------------------------------------ */

static void *xcalloc (size_t n, size_t size)
{
    void *p = calloc (n, size);

    if (p == NULL)
        err (1, "calloc()");
    return p;
}

/* `size` must be non-zero; realloc(NULL, size) behaves like malloc(size). */
static void *xrealloc (void *p, size_t size)
{
    void *q = realloc (p, size);

    if (q == NULL)
        err (1, "realloc()");
    return q;
}

static void *xreallocarray (void *p, size_t n, size_t size)
{
    void *q = reallocarray (p, n, size);

    if (q == NULL)
        err (1, "reallocarray()");
    return q;
}

static char *xstrdup (const char *s)
{
    char *copy = strdup (s);

    if (copy == NULL)
        err (1, "strdup()");
    return copy;
}

static char *xstrndup (const char *s, size_t n)
{
    char *copy = strndup (s, n);

    if (copy == NULL)
        err (1, "strndup()");
    return copy;
}

/* Allocate one zero-initialised object of type T; never returns NULL. */
#define new(T) ((T *)xcalloc (1, sizeof (T)))

/* ------------------------------------------------------------------------
 * Character helpers: <ctype.h> functions are only defined for values
 * representable as unsigned char (or EOF), so plain char must be converted.
 * ------------------------------------------------------------------------ */

static bool is_space (char ch)
{
    return isspace ((unsigned char)ch) != 0;
}

static bool is_alpha (char ch)
{
    return isalpha ((unsigned char)ch) != 0;
}

static char to_lower (char ch)
{
    return (char)tolower ((unsigned char)ch);
}

/* ------------------------------------------------------------------------
 * Growable NUL-terminated string buffer.
 * `data` always has room for `cap` characters plus the terminator.
 * ------------------------------------------------------------------------ */

struct strbuf {
    char *data;
    size_t len, cap;
};

static void strbuf_init (struct strbuf *buf)
{
    buf->len = 0;
    buf->cap = 10;
    buf->data = xrealloc (NULL, buf->cap + 1);
}

static void strbuf_push (struct strbuf *buf, char ch)
{
    if (buf->len == buf->cap) {
        buf->cap *= 2;
        buf->data = xrealloc (buf->data, buf->cap + 1);
    }
    buf->data[buf->len++] = ch;
}

/* Terminate the buffer and hand ownership of the string to the caller. */
static char *strbuf_finish (struct strbuf *buf)
{
    buf->data[buf->len] = '\0';
    return buf->data;
}

/* ------------------------------------------------------------------------
 * Syntax tree
 * ------------------------------------------------------------------------ */

enum syntax_type {
    S_SUB,      /* parenthesised sub-expression, uses `syn`          */
    S_STRING,   /* string literal, uses `string`                     */
    S_PLUS,     /* x+, uses `syn`                                    */
    S_STAR,     /* x*, uses `syn`                                    */
    S_OPT,      /* x?, uses `syn`                                    */
    S_BINARY,   /* binary operator, uses `op`, `left`, `right`       */
    S_ANYOF,    /* [set], uses `string` (the expanded members)       */
    S_EXCEPT,   /* [^set], uses `string` (the expanded members)      */
    S_EPSILON,  /* empty match, no payload                           */
    S_IDENT,    /* reference to a named definition, uses `string`    */
};

struct syntax {
    enum syntax_type type;
    union {
        struct syntax *syn;
        char *string;
        struct {
            const char *op;     /* static string, never freed */
            struct syntax *left, *right;
        };
        struct {
            char begin, end;
        };
    };
};

struct definition {
    char *name;
    struct syntax *syn;
};

enum Visibility {
    V_PRIVATE,
    V_PUBLIC,
    V_PACKAGE,
};

/* Java modifier text (with trailing space) indexed by enum Visibility. */
static const char *visibilities[] = {
    [V_PRIVATE] = "private ",
    [V_PUBLIC]  = "public ",
    [V_PACKAGE] = "",
};

/* Release a syntax tree; accepts NULL and partially built nodes. */
static void free_syntax (struct syntax *syn)
{
    if (syn == NULL)
        return;

    switch (syn->type) {
    case S_SUB:
    case S_PLUS:
    case S_STAR:
    case S_OPT:
        free_syntax (syn->syn);
        break;
    case S_STRING:
    case S_ANYOF:
    case S_EXCEPT:
    case S_IDENT:
        free (syn->string);
        break;
    case S_BINARY:
        free_syntax (syn->left);
        free_syntax (syn->right);
        break;
    case S_EPSILON:
        break;
    }
    free (syn);
}

/* ------------------------------------------------------------------------
 * Lexical helpers
 * ------------------------------------------------------------------------ */

/* Return a pointer to the first non-whitespace character of `s`. */
static const char *strip_ws (const char *s)
{
    while (is_space (*s))
        ++s;
    return s;
}

/* Advance the cursor `*s` past any whitespace. */
static void skip_ws (const char **s)
{
    *s = strip_ws (*s);
}

static bool starts_with (const char *s, const char *prefix)
{
    return strncmp (s, prefix, strlen (prefix)) == 0;
}

/*
 * Read characters from `*s` up to an unescaped `end` character.
 *
 * Backslashes are kept in the result (the text is later pasted into a Java
 * string literal, which interprets them).  On success the cursor is left
 * just past `end` and a heap string is returned.  If the input ends first,
 * NULL is returned and the cursor is left on the NUL terminator.
 */
static char *parse_string (const char **s, char end)
{
    struct strbuf buf;
    bool esc = false;

    strbuf_init (&buf);
    for (;;) {
        const char ch = *(*s)++;

        if (ch == end && !esc)
            break;
        if (ch == '\0') {
            --*s;
            free (buf.data);
            return NULL;
        }
        esc = !esc && ch == '\\';
        strbuf_push (&buf, ch);
    }
    return strbuf_finish (&buf);
}

/* Read the run of alphabetic characters at `*s` into a heap string. */
static char *parse_ident (const char **s)
{
    struct strbuf buf;

    strbuf_init (&buf);
    while (is_alpha (**s))
        strbuf_push (&buf, *(*s)++);
    return strbuf_finish (&buf);
}

/* Map the character following a backslash inside [...] to its value. */
static char unescape (char ch)
{
    switch (ch) {
    case 'a':  return '\a';
    case 'b':  return '\b';
    case 'f':  return '\f';
    case 'n':  return '\n';
    case 'r':  return '\r';
    case 't':  return '\t';
    case 'v':  return '\v';
    default:   return ch;   /* includes \\, \' and \" */
    }
}

/*
 * Parse the inside of a character set; `*s` points just past the '['.
 *
 * Fills `syn` with S_ANYOF or S_EXCEPT (leading '^') and the expanded list
 * of member characters.  "a-z" expands to every character in between; a '-'
 * directly before ']' is a literal dash; a leading '-' is rejected.
 *
 * Returns `syn` on success.  On malformed input returns NULL, leaving
 * `syn->string` NULL; `syn` itself stays owned by the caller.
 */
static struct syntax *parse_range (struct syntax *syn, const char **s)
{
    struct strbuf set;

    if (**s == '^') {
        syn->type = S_EXCEPT;
        ++*s;
    } else {
        syn->type = S_ANYOF;
    }

    strbuf_init (&set);
    for (;;) {
        char ch = *(*s)++;

        switch (ch) {
        case '\0':
            --*s;
            goto fail;
        case ']':
            goto done;
        case '-': {
            unsigned char lo, hi;

            if (set.len == 0)
                goto fail;
            ch = *(*s)++;
            if (ch == ']') {
                strbuf_push (&set, '-');
                goto done;
            }
            if (ch == '\0') {
                --*s;
                goto fail;
            }
            /* Compare as unsigned so the loop can neither wrap around nor
             * produce an embedded NUL. */
            lo = (unsigned char)set.data[set.len - 1];
            hi = (unsigned char)ch;
            if (lo > hi)
                goto fail;
            for (unsigned c = lo + 1u; c <= hi; ++c)
                strbuf_push (&set, (char)c);
            break;
        }
        case '\\':
            ch = *(*s)++;
            if (ch == '\0') {
                /* A trailing backslash must not let us run off the input. */
                --*s;
                goto fail;
            }
            ch = unescape (ch);
            /* FALLTHROUGH */
        default:
            strbuf_push (&set, ch);
            break;
        }
    }

done:
    syn->string = strbuf_finish (&set);
    return syn;

fail:
    free (set.data);
    return NULL;
}

/* ------------------------------------------------------------------------
 * Recursive-descent parser.  Every function takes a cursor `s`, advances it
 * past what it consumed, and returns a heap tree or NULL on error.
 * ------------------------------------------------------------------------ */

static struct syntax *parse_syn (const char **s);

/* atom := '(' syntax ')' | '[' set ']' | '"' string '"' | ident | epsilon */
static struct syntax *syn_atom (const char **s)
{
    struct syntax *syn = new (struct syntax);
    char ch;

    skip_ws (s);
    ch = *(*s)++;
    switch (ch) {
    case '(':
        syn->type = S_SUB;
        syn->syn = parse_syn (s);
        if (syn->syn == NULL || **s != ')')
            goto fail;
        ++*s;
        break;
    case '[':
        if (parse_range (syn, s) == NULL)
            goto fail;
        break;
    case '"':
        syn->type = S_STRING;
        syn->string = parse_string (s, '"');
        if (syn->string == NULL)
            goto fail;
        break;
    case ';':
    case ')':
    case '|':
    case '&':
        /* Nothing before a terminator/operator: match the empty string and
         * leave the character for the caller. */
        syn->type = S_EPSILON;
        --*s;
        break;
    case '\0':
        --*s;
        warnx ("unexpected end of input");
        goto fail;
    default:
        if (!is_alpha (ch)) {
            warnx ("invalid input: '%c'", ch);
            goto fail;
        }
        --*s;
        syn->type = S_IDENT;
        syn->string = parse_ident (s);
        break;
    }
    return syn;

fail:
    free_syntax (syn);
    return NULL;
}

/* suffix := atom ('*' | '+' | '?')* */
static struct syntax *syn_suffix (const char **s)
{
    struct syntax *syn = syn_atom (s);

    if (syn == NULL)
        return NULL;

    for (;;) {
        enum syntax_type type;
        struct syntax *wrapper;

        switch (**s) {
        case '*': type = S_STAR; break;
        case '+': type = S_PLUS; break;
        case '?': type = S_OPT;  break;
        default:  return syn;
        }
        ++*s;

        wrapper = new (struct syntax);
        wrapper->type = type;
        wrapper->syn = syn;
        syn = wrapper;
    }
}

/* Build a binary node that takes ownership of both operands. */
static struct syntax *make_binary (const char *op, struct syntax *left,
                                   struct syntax *right)
{
    struct syntax *node = new (struct syntax);

    node->type = S_BINARY;
    node->op = op;
    node->left = left;
    node->right = right;
    return node;
}

/* True for the characters that end a juxtaposition sequence. */
static bool ends_sequence (char ch)
{
    return ch == ';' || ch == '|' || ch == ')' || ch == '\\' || ch == '&';
}

/* andthen := suffix suffix*   (left-associative "andThen") */
static struct syntax *syn_andthen (const char **s)
{
    struct syntax *syn = syn_suffix (s);

    if (syn == NULL)
        return NULL;

    for (;;) {
        struct syntax *right;

        skip_ws (s);
        if (ends_sequence (**s))
            return syn;

        right = syn_suffix (s);
        if (right == NULL) {
            free_syntax (syn);
            return NULL;
        }
        syn = make_binary ("andThen", syn, right);
    }
}

/*
 * Parse `operand (opch operand)*` into a left-associative chain of binary
 * nodes named `opname`.
 */
static struct syntax *parse_chain (const char **s, char opch,
                                   const char *opname,
                                   struct syntax *(*operand) (const char **))
{
    struct syntax *syn = operand (s);

    if (syn == NULL)
        return NULL;

    for (;;) {
        struct syntax *right;

        skip_ws (s);
        if (**s != opch)
            return syn;
        ++*s;

        right = operand (s);
        if (right == NULL) {
            free_syntax (syn);
            return NULL;
        }
        syn = make_binary (opname, syn, right);
    }
}

/* butonly := andthen ('&' andthen)* */
static struct syntax *syn_butonly (const char **s)
{
    return parse_chain (s, '&', "butOnly", syn_andthen);
}

/* butnot := butonly ('\' butonly)* */
static struct syntax *syn_butnot (const char **s)
{
    return parse_chain (s, '\\', "butNot", syn_butonly);
}

/* or := butnot ('|' butnot)* */
static struct syntax *syn_or (const char **s)
{
    return parse_chain (s, '|', "orElse", syn_butnot);
}

/* Entry point of the expression grammar. */
static struct syntax *parse_syn (const char **s)
{
    return syn_or (s);
}

/* ------------------------------------------------------------------------
 * Java output
 * ------------------------------------------------------------------------ */

/*
 * Write `ch` as a Java char literal.
 *
 * Java has no \a or \v escapes, so those (and any other control character
 * without a short escape) are written as \uXXXX.
 */
static void print_char (FILE *file, char ch)
{
    char letter;

    switch (ch) {
    case '\b': letter = 'b';  break;
    case '\f': letter = 'f';  break;
    case '\n': letter = 'n';  break;
    case '\r': letter = 'r';  break;
    case '\t': letter = 't';  break;
    case '\\': letter = '\\'; break;
    case '\'': letter = '\''; break;
    default:
        if (iscntrl ((unsigned char)ch))
            fprintf (file, "'\\u%04x'", (unsigned)(unsigned char)ch);
        else
            fprintf (file, "'%c'", ch);
        return;
    }
    fprintf (file, "'\\%c'", letter);
}

/*
 * Write the Java expression for `syn`.  `nesting` is the number of tabs
 * already in effect; each binary operator goes on its own line one level
 * deeper than its left operand.
 */
static void print_syntax (FILE *file, const struct syntax *syn, int nesting)
{
    switch (syn->type) {
    case S_EPSILON:
        fprintf (file, "TokenFragment.epsilon()");
        break;
    case S_SUB:
        print_syntax (file, syn->syn, nesting);
        break;
    case S_STRING:
        fprintf (file, "TokenFragment.of(\"%s\")", syn->string);
        break;
    case S_ANYOF:
    case S_EXCEPT:
        fprintf (file, "TokenFragment.%s(",
                 syn->type == S_ANYOF ? "anyOf" : "except");
        /* Start at 0 so an empty set prints no arguments instead of
         * reading past the terminator. */
        for (size_t i = 0; syn->string[i] != '\0'; ++i) {
            if (i > 0)
                fputs (", ", file);
            print_char (file, syn->string[i]);
        }
        fputc (')', file);
        break;
    case S_PLUS:
        print_syntax (file, syn->syn, nesting);
        fprintf (file, ".plus()");
        break;
    case S_STAR:
        print_syntax (file, syn->syn, nesting);
        fprintf (file, ".star()");
        break;
    case S_OPT:
        print_syntax (file, syn->syn, nesting);
        fprintf (file, ".optional()");
        break;
    case S_BINARY:
        print_syntax (file, syn->left, nesting);
        ++nesting;
        fputc ('\n', file);
        for (int i = 0; i < nesting; ++i)
            fputc ('\t', file);
        fprintf (file, ".%s(", syn->op);
        print_syntax (file, syn->right, nesting);
        fprintf (file, ")");
        break;
    case S_IDENT:
        /* Definitions are emitted as fields named with a lower-case
         * first letter; references must use the same spelling. */
        fprintf (file, "%c%s", to_lower (syn->string[0]), syn->string + 1);
        break;
    }
}

/* ------------------------------------------------------------------------
 * Definitions
 * ------------------------------------------------------------------------ */

/*
 * Parse "Name <sep> syntax ;" from `s` into `def`.
 *
 * Returns 0 on success and 1 on error.  `def` is always left in a state
 * that is safe to release (unset members are NULL), even on error.
 * Anything following the ';' is ignored.
 */
static int parse_named (const char *s, char sep, struct definition *def)
{
    size_t namelen = 0;

    def->name = NULL;
    def->syn = NULL;

    while (is_alpha (s[namelen]))
        ++namelen;
    if (namelen == 0)
        return 1;
    def->name = xstrndup (s, namelen);

    s = strip_ws (s + namelen);
    if (*s++ != sep)
        return 1;

    def->syn = parse_syn (&s);
    if (def->syn == NULL)
        return 1;
    return *s == ';' ? 0 : 1;
}

/* Parse a token definition, "Name: syntax;". */
static int parse_def (const char *s, struct definition *def)
{
    return parse_named (s, ':', def);
}

/* Parse a variable definition, "Name = syntax;". */
static int parse_vardef (const char *s, struct definition *def)
{
    return parse_named (s, '=', def);
}

/* Growable array of definitions. */
struct deflist {
    struct definition *items;
    size_t len, cap;
};

/* Append an empty slot and return it; the slot counts towards `len`. */
static struct definition *deflist_next (struct deflist *list)
{
    struct definition *slot;

    if (list->len == list->cap) {
        list->cap = list->cap != 0 ? list->cap * 2 : 10;
        list->items = xreallocarray (list->items, list->cap,
                                     sizeof *list->items);
    }
    slot = &list->items[list->len++];
    slot->name = NULL;
    slot->syn = NULL;
    return slot;
}

static void deflist_free (struct deflist *list)
{
    for (size_t i = 0; i < list->len; ++i) {
        free (list->items[i].name);
        free_syntax (list->items[i].syn);
    }
    free (list->items);
    list->items = NULL;
    list->len = 0;
    list->cap = 0;
}

/*
 * Read one line of any length into `*line` and strip its newline.
 * Returns false at end of file or on a read error.
 */
static bool read_line (FILE *file, char **line, size_t *cap)
{
    if (getline (line, cap, file) < 0)
        return false;
    (*line)[strcspn (*line, "\n")] = '\0';
    return true;
}

/* Write the enum, the fragment/rule fields and construct() to `out`. */
static void emit_members (FILE *out, const char *vis,
                          const char *eof, const char *error,
                          const struct deflist *vars,
                          const struct deflist *tokens)
{
    fprintf (out, "\t%sstatic enum TokenType {\n", vis);
    if (eof != NULL)
        fprintf (out, "\t\t%s,\n", eof);
    if (error != NULL)
        fprintf (out, "\t\t%s,\n", error);
    for (size_t i = 0; i < tokens->len; ++i)
        fprintf (out, "\t\t%s,\n", tokens->items[i].name);
    fprintf (out, "\t}\n\n");

    if (vars->len > 0) {
        fprintf (out, "\t// Variable definitions\n");
        for (size_t i = 0; i < vars->len; ++i) {
            const struct definition *var = &vars->items[i];

            fprintf (out, "\t%sstatic final TokenFragment %c%s = \n\t\t",
                     vis, to_lower (var->name[0]), var->name + 1);
            print_syntax (out, var->syn, 2);
            fprintf (out, ";\n");
        }
        fputc ('\n', out);
    }

    fprintf (out, "\t// Token Definitions\n");
    for (size_t i = 0; i < tokens->len; ++i) {
        const struct definition *tok = &tokens->items[i];

        fprintf (out,
                 "\t%sstatic final TokenRule %c%s = new TokenRule<>(TokenType.%s,\n\t\t",
                 vis, to_lower (tok->name[0]), tok->name + 1, tok->name);
        print_syntax (out, tok->syn, 2);
        fprintf (out, "\n\t);\n");
    }
    fputc ('\n', out);

    fprintf (out, "\t%sstatic final Lexer construct() {\n", vis);
    /* NOTE(review): the element type TokenRule<TokenType> is presumed from
     * the rule fields emitted above - confirm against the Java runtime. */
    fprintf (out,
             "\t\tfinal var rules = new java.util.ArrayList<TokenRule<TokenType>>();\n");
    for (size_t i = 0; i < tokens->len; ++i) {
        const struct definition *tok = &tokens->items[i];

        fprintf (out, "\t\trules.add(%c%s);\n",
                 to_lower (tok->name[0]), tok->name + 1);
    }
    fprintf (out, "\t\treturn new Lexer(new TokenRuleSet(rules))");
    if (eof != NULL)
        fprintf (out, "\n\t\t\t.setEndType(TokenType.%s)", eof);
    if (error != NULL)
        fprintf (out, "\n\t\t\t.setErrorType(TokenType.%s)", error);
    fprintf (out, ";\n\t}\n\n");
}

/*
 * Translate the description read from `infile` into a Java class written to
 * `outfile`.
 *
 *   infilename  used only in diagnostics
 *   vis         Java modifier text placed before each generated member
 *   package     Java package name, or NULL for none
 *   classname   name of the generated class
 *
 * Returns 0 on success and 1 after printing a diagnostic.  On error
 * `outfile` may already contain partial output.
 */
static int compile (FILE *infile, const char *infilename, FILE *outfile,
                    const char *vis, const char *package,
                    const char *classname)
{
    struct deflist vars = { NULL, 0, 0 };
    struct deflist tokens = { NULL, 0, 0 };
    char *line = NULL;
    size_t linecap = 0;
    size_t linenum = 0;
    char *eof = NULL;
    char *error = NULL;
    const char *s;
    int ret = 1;

    if (package != NULL)
        fprintf (outfile, "package %s;\n\n", package);

    /* Parse declarations */
    for (;;) {
        ++linenum;
        if (!read_line (infile, &line, &linecap)) {
            warnx ("%s: %zu: unexpected end of file", infilename, linenum);
            goto out;
        }

        s = strip_ws (line);
        if (*s == '\0' || starts_with (s, "//"))
            continue;
        if (strcmp (s, "%%") == 0)
            break;

        if (starts_with (s, "%import ")) {
            /* Drop the '%': the rest is already a Java import clause. */
            fprintf (outfile, "%s;\n", s + 1);
        } else if (starts_with (s, "%eof ")) {
            free (eof);     /* a later %eof replaces an earlier one */
            eof = xstrdup (strip_ws (s + 4));
        } else if (starts_with (s, "%error ")) {
            free (error);
            error = xstrdup (strip_ws (s + 6));
        } else if (parse_vardef (s, deflist_next (&vars)) != 0) {
            warnx ("%s: %zu: unrecognized statement: %s",
                   infilename, linenum, s);
            goto out;
        }
    }

    /* Missing special tokens are reported but do not stop generation. */
    if (eof == NULL)
        warnx ("%s: %zu: end-of-file token not defined. Use `%%eof NAME` to define an EOF token.",
               infilename, linenum);
    if (error == NULL)
        warnx ("%s: %zu: error token not defined. Use `%%error NAME` to define an error token.",
               infilename, linenum);

    fprintf (outfile, "\npublic class %s {\n", classname);

    /* Parse definitions */
    while (read_line (infile, &line, &linecap)) {
        ++linenum;

        s = strip_ws (line);
        if (*s == '\0' || starts_with (s, "//"))
            continue;
        if (strcmp (s, "%%") == 0)
            break;

        if (parse_def (s, deflist_next (&tokens)) != 0) {
            warnx ("%s: %zu: invalid token definition", infilename, linenum);
            goto out;
        }
    }

    if (tokens.len == 0) {
        warnx ("%s: %zu: no token definitions", infilename, linenum);
        goto out;
    }

    emit_members (outfile, vis, eof, error, &vars, &tokens);

    /* Copy the trailer into the class body, one tab deeper. */
    while (getline (&line, &linecap, infile) >= 0) {
        fputc ('\t', outfile);
        fputs (line, outfile);
    }
    fputs ("}\n", outfile);
    ret = 0;

out:
    deflist_free (&tokens);
    deflist_free (&vars);
    free (error);
    free (eof);
    free (line);
    return ret;
}

/* ------------------------------------------------------------------------
 * Command line
 * ------------------------------------------------------------------------ */

static int usage (void)
{
    fputs ("usage: lexicgen [-v VISIBILITY] [-p PACKAGE] [-o FILE] file\n",
           stderr);
    return 1;
}

/*
 * Derive names from the input path.
 *
 * The class name is the part of the file name (after the last '/') up to
 * its first '.'.  If `outfilename` is non-NULL it receives the input path
 * truncated at that same '.' with ".java" appended.  Both results are heap
 * strings owned by the caller.
 *
 * Returns 0 on success, -1 if the class name would be empty.
 */
static int gen_names (const char *filename, char **outfilename,
                      char **classname)
{
    const char *slash = strrchr (filename, '/');
    const char *base = slash != NULL ? slash + 1 : filename;
    const char *dot = strchr (base, '.');

    if (dot == NULL)
        dot = base + strlen (base);
    if (dot == base)
        return -1;

    if (outfilename != NULL
        && asprintf (outfilename, "%.*s.java",
                     (int)(dot - filename), filename) < 0)
        err (1, "asprintf()");
    if (asprintf (classname, "%.*s", (int)(dot - base), base) < 0)
        err (1, "asprintf()");
    return 0;
}

int main (int argc, char *argv[])
{
    int option;
    char *package = NULL;
    char *infilename = NULL, *outfilename = NULL, *classname = NULL;
    char *generated_name = NULL;    /* output name we allocated ourselves */
    FILE *infile, *outfile;
    enum Visibility vis = V_PRIVATE;
    bool to_stdout, write_failed;
    int ret;

#ifdef __OpenBSD__
    if (pledge ("stdio rpath wpath cpath", NULL) != 0)
        err (1, "pledge()");
#endif

    while ((option = getopt (argc, argv, "p:v:o:")) != -1) {
        switch (option) {
        case 'p':
            package = optarg;
            break;
        case 'v':
            if (strcmp (optarg, "public") == 0) {
                vis = V_PUBLIC;
            } else if (strcmp (optarg, "private") == 0) {
                vis = V_PRIVATE;
            } else if (strcmp (optarg, "package") == 0) {
                vis = V_PACKAGE;
            } else {
                errx (1, "Invalid visibility: '%s', expected any of: private, public, package",
                      optarg);
            }
            break;
        case 'o':
            outfilename = optarg;
            break;
        default:
            return usage ();
        }
    }

    argv += optind;
    argc -= optind;

    if (argc != 1)
        return usage ();

    infilename = argv[0];

    if (gen_names (infilename, outfilename != NULL ? NULL : &generated_name,
                   &classname) != 0)
        errx (1, "invalid file name: %s", infilename);
    if (outfilename == NULL)
        outfilename = generated_name;

    infile = fopen (infilename, "r");
    if (infile == NULL)
        err (1, "fopen(\"%s\")", infilename);

    to_stdout = strcmp (outfilename, "-") == 0;
    if (to_stdout) {
        outfile = stdout;
    } else {
        outfile = fopen (outfilename, "w");
        if (outfile == NULL)
            err (1, "fopen(\"%s\")", outfilename);
    }

#ifdef __OpenBSD__
    /* Removing a partial output file on failure needs "cpath". */
    if (pledge (to_stdout ? "stdio" : "stdio cpath", NULL) != 0)
        err (1, "pledge()");
#endif

    ret = compile (infile, infilename, outfile, visibilities[vis], package,
                   classname);

    /* Buffered write errors may only surface when the stream is closed. */
    write_failed = ferror (outfile) != 0;
    if (fclose (outfile) != 0)
        write_failed = true;
    if (write_failed) {
        warnx ("%s: write error", outfilename);
        ret = 1;
    }
    fclose (infile);

    /* Never leave a half-written file behind; "-" is not a file of ours. */
    if (ret != 0 && !to_stdout)
        remove (outfilename);

    free (generated_name);
    free (classname);
    return ret;
}