Blob


1 #ifdef __OpenBSD__
2 # define _BSD_SOURCE
3 #elif defined(__GNUC__)
4 # define _DEFAULT_SOURCE
5 # define _GNU_SOURCE
6 #endif
8 #include <stdbool.h>
9 #include <stdlib.h>
10 #include <string.h>
11 #include <unistd.h>
12 #include <ctype.h>
13 #include <stdio.h>
14 #include <err.h>
/* Allocate one zero-initialised object of type T; evaluates to a `T *`, or NULL when calloc() fails. */
#define new(T) ((T *) calloc ((size_t) 1, sizeof (T)))
/* Discriminator for `struct syntax`: selects which union member is live. */
enum syntax_type {
	S_SUB = 0,	/* parenthesised group "( ... )"; child in `syn` */
	S_STRING,	/* double-quoted literal; text in `string` */
	S_PLUS,		/* postfix '+'; operand in `syn` */
	S_STAR,		/* postfix '*'; operand in `syn` */
	S_OPT,		/* postfix '?'; operand in `syn` */
	S_BINARY,	/* two-operand combinator; uses `op`, `left`, `right` */
	S_ANYOF,	/* character class "[...]"; members in `string` */
	S_EXCEPT,	/* negated class "[^...]"; members in `string` */
	S_EPSILON,	/* empty operand (nothing before a terminator/operator) */
	S_IDENT,	/* reference to a named definition; name in `string` */
};
31 struct syntax {
32 enum syntax_type type;
33 union {
34 struct syntax *syn;
35 char *string;
36 struct {
37 const char *op;
38 struct syntax *left, *right;
39 };
40 struct {
41 char begin, end;
42 };
44 };
45 };
/* A named expression: either a variable ("name = expr;") or a token ("name: expr;"). */
struct definition {
	char          *name;	/* heap-allocated identifier */
	struct syntax *syn;	/* parsed right-hand side */
};
/* Java access level applied to the generated members (selected with -v). */
enum Visibility {
	V_PRIVATE = 0,
	V_PUBLIC,
	V_PACKAGE,
};

/*
 * Java modifier text for each `enum Visibility` value, including the
 * trailing space so it can be emitted directly in front of "static".
 * Package-private has no keyword, hence the empty string.
 */
static const char *visibilities[] = {
	[V_PRIVATE] = "private ",
	[V_PUBLIC]  = "public ",
	[V_PACKAGE] = "",
};
/*
 * Skip leading whitespace.
 *
 * Returns a pointer into `s` at the first character for which isspace()
 * is false (possibly the terminating NUL).  Nothing is copied.
 */
static const char *strip_ws (const char *s)
{
	/*
	 * <ctype.h> classifiers take an int that must be representable as
	 * unsigned char (or EOF); passing a negative plain char is undefined.
	 * isspace('\0') is false, so the terminator also ends the loop.
	 */
	while (isspace ((unsigned char)*s))
		++s;
	return s;
}
/* Report whether `s` begins with `prefix` (an empty prefix always matches). */
static bool starts_with (const char *s, const char *prefix)
{
	/* Walk both strings in lockstep until the prefix is exhausted. */
	for (; *prefix != '\0'; ++s, ++prefix) {
		if (*s != *prefix)
			return false;
	}
	return true;
}
/*
 * Read the body of a quoted literal.
 *
 * `*s` must point just past the opening quote.  Characters are copied
 * verbatim (backslashes included) until an unescaped `end` character is
 * found; `*s` is left just past that closing character.
 *
 * Returns a heap-allocated NUL-terminated copy the caller must free(),
 * or NULL if the input ends before the closing character or memory runs
 * out.  On an unterminated literal `*s` is left on the terminating NUL
 * rather than beyond it.
 */
static char *parse_string (const char **s, char end)
{
	size_t len = 0, cap = 10;
	bool esc = false;
	char *str = malloc (cap + 1);

	if (str == NULL)
		return NULL;

	for (;;) {
		const char ch = **s;

		if (ch == end && !esc) {
			++*s;
			break;
		}
		if (ch == '\0') {
			free (str);
			return NULL;
		}
		++*s;

		/* A backslash escapes the next character unless it is itself escaped. */
		esc = !esc && (ch == '\\');

		if (len == cap) {
			char *tmp = realloc (str, cap * 2 + 1);

			if (tmp == NULL) {
				free (str);
				return NULL;
			}
			str = tmp;
			cap *= 2;
		}
		str[len++] = ch;
	}

	str[len] = '\0';
	return str;
}
/*
 * Read the longest run of alphabetic characters at `*s`.
 *
 * `*s` is advanced past the run.  Returns a heap-allocated NUL-terminated
 * copy (possibly empty) the caller must free(), or NULL when memory
 * cannot be allocated.
 */
static char *parse_ident (const char **s)
{
	size_t len = 0, cap = 10;
	char *str = malloc (cap + 1);

	if (str == NULL)
		return NULL;

	/* The cast keeps isalpha() defined for bytes >= 0x80 where char is signed. */
	while (isalpha ((unsigned char)**s)) {
		if (len == cap) {
			char *tmp = realloc (str, cap * 2 + 1);

			if (tmp == NULL) {
				free (str);
				return NULL;
			}
			str = tmp;
			cap *= 2;
		}
		str[len++] = *(*s)++;
	}

	str[len] = '\0';
	return str;
}
130 static struct syntax *parse_range (struct syntax *syn, const char **s)
132 size_t len, cap;
133 char ch, prev, *str;
135 if (**s == '^') {
136 syn->type = S_EXCEPT;
137 ++*s;
138 } else {
139 syn->type = S_ANYOF;
142 len = 0;
143 cap = 10;
144 str = malloc (cap + 1);
146 while (1) {
147 ch = *(*s)++;
148 switch (ch) {
149 case '\0':
150 return NULL;
151 case ']':
152 goto end;
153 case '-':
154 if (len == 0)
155 return NULL;
156 ch = *(*s)++;
158 if (ch == ']') {
159 if (len == cap) {
160 cap *= 2;
161 str = realloc (str, cap + 1);
163 str[len++] = '-';
164 goto end;
167 prev = str[len - 1];
168 if (prev > ch)
169 return NULL;
171 if ((int)(cap - len) < (ch - prev)) {
172 cap = cap * 2 + (ch - prev);
173 str = realloc (str, cap + 1);
176 for (char i = prev + 1; i <= ch; ++i)
177 str[len++] = i;
178 break;
179 case '\\':
180 ch = *(*s)++;
181 switch (ch) {
182 case 'a': ch = '\a'; break;
183 case 'b': ch = '\b'; break;
184 case 'f': ch = '\f'; break;
185 case 'n': ch = '\n'; break;
186 case 'r': ch = '\r'; break;
187 case 't': ch = '\t'; break;
188 case 'v': ch = '\v'; break;
189 case '\\': ch = '\\'; break;
190 case '\'': ch = '\''; break;
191 case '"': ch = '\"'; break;
192 default: break;
195 /* FALLTHROUGH */
196 default:
197 if (len == cap) {
198 cap *= 2;
199 str = realloc (str, cap + 1);
201 str[len++] = ch;
202 break;
206 end:
207 str[len++] = '\0';
208 syn->string = realloc (str, len);
209 return syn;
212 static struct syntax *parse_syn (const char **s);
213 static struct syntax *syn_atom (const char **s)
215 struct syntax *syn = new (struct syntax);
216 char ch;
218 for (; isspace (**s); ++*s);
220 ch = *(*s)++;
222 switch (ch) {
223 case '(':
224 syn->type = S_SUB;
225 syn->syn = parse_syn (s);
226 if (syn->syn == NULL || **s != ')')
227 return NULL;
228 ++*s;
229 break;
230 case '[':
231 return parse_range (syn, s);
232 case '"':
233 syn->type = S_STRING;
234 syn->string = parse_string (s, '"');
235 if (syn->string == NULL)
236 return NULL;
237 break;
238 case ';':
239 case ')':
240 case '|':
241 case '&':
242 syn->type = S_EPSILON;
243 --*s;
244 break;
245 default:
246 if (!isalpha (ch)) {
247 warnx ("invalid input: '%c'", ch);
248 return NULL;
250 --*s;
251 syn->type = S_IDENT;
252 syn->string = parse_ident (s);
253 break;
256 return syn;
259 static struct syntax *syn_suffix (const char **s)
261 struct syntax *syn, *n;
262 char ch;
264 syn = syn_atom (s);
265 if (syn == NULL)
266 return NULL;
268 while (**s == '*' || **s == '+' || **s == '?') {
269 n = new (struct syntax);
270 ch = *(*s)++;
271 switch (ch) {
272 case '*':
273 n->type = S_STAR;
274 break;
275 case '+':
276 n->type = S_PLUS;
277 break;
278 case '?':
279 n->type = S_OPT;
280 break;
282 n->syn = syn;
283 syn = n;
286 return syn;
289 static struct syntax *syn_andthen (const char **s)
291 struct syntax *syn, *n;
293 syn = syn_suffix (s);
294 if (syn == NULL)
295 return NULL;
297 while (1) {
298 for (; isspace (**s); ++*s);
299 if (**s == ';' || **s == '|' || **s == ')' || **s == '\\' || **s == '&')
300 break;
302 n = new (struct syntax);
303 n->type = S_BINARY;
304 n->op = "andThen";
305 n->left = syn;
306 n->right = syn_suffix (s);
307 if (n->right == NULL)
308 return NULL;
309 syn = n;
312 return syn;
315 static struct syntax *syn_butonly (const char **s)
317 struct syntax *syn, *n;
319 syn = syn_andthen (s);
320 if (syn == NULL)
321 return NULL;
323 while (1) {
324 for (; isspace (**s); ++*s);
325 if (**s != '&')
326 break;
327 ++*s;
329 n = new (struct syntax);
330 n->type = S_BINARY;
331 n->op = "butOnly";
332 n->left = syn;
333 n->right = syn_andthen (s);
334 if (n->right == NULL)
335 return NULL;
336 syn = n;
339 return syn;
342 static struct syntax *syn_butnot (const char **s)
344 struct syntax *syn, *n;
346 syn = syn_butonly (s);
347 if (syn == NULL)
348 return NULL;
350 while (1) {
351 for (; isspace (**s); ++*s);
352 if (**s != '\\')
353 break;
354 ++*s;
356 n = new (struct syntax);
357 n->type = S_BINARY;
358 n->op = "butNot";
359 n->left = syn;
360 n->right = syn_butonly (s);
361 if (n->right == NULL)
362 return NULL;
363 syn = n;
366 return syn;
369 static struct syntax *syn_or (const char **s)
371 struct syntax *syn, *n;
373 syn = syn_butnot (s);
374 if (syn == NULL)
375 return NULL;
377 while (1) {
378 for (; isspace (**s); ++*s);
379 if (**s != '|')
380 break;
381 ++*s;
383 n = new (struct syntax);
384 n->type = S_BINARY;
385 n->op = "orElse";
386 n->left = syn;
387 n->right = syn_butnot (s);
388 if (n->right == NULL)
389 return NULL;
390 syn = n;
393 return syn;
/*
 * Entry point of the expression parser: parses a full expression at `*s`
 * starting from the loosest-binding level ('|').  Returns the expression
 * tree, or NULL if parsing fails.
 */
static struct syntax *parse_syn (const char **s)
{
	struct syntax *tree = syn_or (s);

	return tree;
}
/*
 * Write `ch` to `file` as a single-quoted character literal.
 *
 * Alert, backspace, form feed, newline, carriage return, tab, vertical
 * tab, backslash and single quote are written as backslash escapes; every
 * other character is written as-is between the quotes.
 */
static void print_char (FILE *file, char ch)
{
	/* Parallel tables: raw[i] is written as '\' followed by mnemonic[i]. */
	static const char raw[]      = "\a\b\f\n\r\t\v\\'";
	static const char mnemonic[] = "abfnrtv\\'";
	/* strchr() would match the terminator for NUL, so exclude it up front. */
	const char *hit = (ch != '\0') ? strchr (raw, ch) : NULL;

	if (hit == NULL) {
		fprintf (file, "'%c'", ch);
		return;
	}

	const char str[3] = { '\\', mnemonic[hit - raw], '\0' };
	fprintf (file, "'%s'", str);
}
421 static void print_syntax (FILE *file, const struct syntax *syn, int nesting)
423 switch (syn->type) {
424 case S_EPSILON:
425 fprintf (file, "TokenFragment.epsilon()");
426 break;
427 case S_SUB:
428 print_syntax (file, syn->syn, nesting);
429 break;
430 case S_STRING:
431 fprintf (file, "TokenFragment.of(\"%s\")", syn->string);
432 break;
433 case S_ANYOF:
434 case S_EXCEPT:
435 fprintf (file, "TokenFragment.%s(", syn->type == S_ANYOF ? "anyOf" : "except");
436 print_char (file, syn->string[0]);
437 for (size_t i = 1; syn->string[i] != '\0'; ++i) {
438 fputs (", ", file);
439 print_char (file, syn->string[i]);
441 fputc (')', file);
442 break;
443 case S_PLUS:
444 print_syntax (file, syn->syn, nesting);
445 fprintf (file, ".plus()");
446 break;
447 case S_STAR:
448 print_syntax (file, syn->syn, nesting);
449 fprintf (file, ".star()");
450 break;
451 case S_OPT:
452 print_syntax (file, syn->syn, nesting);
453 fprintf (file, ".optional()");
454 break;
455 case S_BINARY:
456 print_syntax (file, syn->left, nesting);
457 ++nesting;
458 fputc ('\n', file);
459 for (int i = 0; i < nesting; ++i)
460 fputc ('\t', file);
461 fprintf (file, ".%s(", syn->op);
462 print_syntax (file, syn->right, nesting);
463 fprintf (file, ")");
464 break;
465 case S_IDENT:
466 fprintf (file, "%c%s", tolower (syn->string[0]), syn->string + 1);
467 break;
471 static int parse_def (const char *s, struct definition *def)
473 size_t i;
475 for (i = 0; s[i] != '\0' && isalpha (s[i]); ++i);
476 if (i == 0)
477 return 1;
479 def->name = strndup (s, i);
480 s += i;
482 for (; isspace (*s); ++s);
484 if (*s++ != ':')
485 return 1;
487 def->syn = parse_syn (&s);
488 if (def->syn == NULL)
489 return 1;
491 if (*s != ';')
492 return 1;
494 return 0;
497 static int parse_vardef (const char *s, struct definition *def)
499 size_t i;
501 for (i = 0; s[i] != '\0' && isalpha (s[i]); ++i);
502 if (i == 0)
503 return 1;
505 def->name = strndup (s, i);
506 s += i;
508 for (; isspace (*s); ++s);
509 if (*s++ != '=')
510 return 1;
512 def->syn = parse_syn (&s);
513 if (def->syn == NULL)
514 return 1;
516 if (*s != ';')
517 return 1;
519 return 0;
522 static int compile (FILE *infile, const char *infilename, FILE *outfile, const char *vis, const char *package, const char *classname)
524 size_t linenum = 0, ntok, tokcap, nvar, varcap;
525 char line[1024];
526 const char *s;
527 struct definition *tokens, *vars;
528 char *eof = NULL;
529 char *error = NULL;
531 if (package != NULL)
532 fprintf (outfile, "package %s;\n\n", package);
534 nvar = 0;
535 varcap = 10;
536 vars = calloc (varcap, sizeof (struct definition));
538 // Parse declarations
539 while (1) {
540 ++linenum;
541 if (fgets (line, sizeof (line), infile) == NULL) {
542 warnx ("%s: %zu: unexpected end of file", infilename, linenum);
543 return 1;
546 line[strcspn (line, "\n")] = '\0';
547 s = strip_ws (line);
548 if (*s == '\0' || starts_with (s, "//"))
549 continue;
551 if (strcmp (s, "%%") == 0)
552 break;
554 if (starts_with (s, "%import ")) {
555 fprintf (outfile, "%s;\n", s + 1);
556 } else if (starts_with (s, "%eof ")) {
557 s = strip_ws (s + 4);
558 eof = strdup (s);
559 } else if (starts_with (s, "%error ")) {
560 s = strip_ws (s + 6);
561 error = strdup (s);
562 } else {
563 if (nvar == varcap) {
564 varcap *= 2;
565 vars = reallocarray (vars, varcap, sizeof (struct definition));
568 if (parse_vardef (s, &vars[nvar++]) != 0) {
569 warnx ("%s: %zu: unrecognized statement: %s", infilename, linenum, s);
570 return 1;
575 if (eof == NULL)
576 warnx ("%s: %zu: end-of-file token not defined. Use `%%eof NAME` to define an EOF token.", infilename, linenum);
577 if (error == NULL)
578 warnx ("%s: %zu: error token not defined. Use `%%error NAME` to define an error token.", infilename, linenum);
580 fprintf (outfile, "\npublic class %s {\n", classname);
582 ntok = 0;
583 tokcap = 10;
584 tokens = calloc (tokcap, sizeof (struct definition));
586 // Parse definitions
587 while (fgets (line, sizeof (line), infile) != NULL) {
588 ++linenum;
589 line[strcspn (line, "\n")] = '\0';
590 s = strip_ws (line);
591 if (*s == '\0' || starts_with (s, "//"))
592 continue;
594 if (strcmp (s, "%%") == 0)
595 break;
597 if (ntok == tokcap) {
598 tokcap *= 2;
599 tokens = reallocarray (tokens, tokcap, sizeof (struct definition));
602 if (parse_def (s, &tokens[ntok++]) != 0) {
603 warnx ("%s: %zu: invalid token definition", infilename, linenum);
604 return 1;
608 if (ntok == 0) {
609 warnx ("%s: %zu: no token definitions", infilename, linenum);
610 return 1;
613 fprintf (outfile, "\t%sstatic enum TokenType {\n", vis);
614 if (eof != NULL)
615 fprintf (outfile, "\t\t%s,\n", eof);
616 if (error != NULL)
617 fprintf (outfile, "\t\t%s,\n", error);
618 for (size_t i = 0; i < ntok; ++i) {
619 fprintf (outfile, "\t\t%s,\n", tokens[i].name);
621 fprintf (outfile, "\t}\n\n");
623 if (nvar > 0) {
624 fprintf (outfile, "\t// Variable definitions\n");
625 for (size_t i = 0; i < nvar; ++i) {
626 const struct definition *var = &vars[i];
627 fprintf (outfile,
628 "\t%sstatic final TokenFragment %c%s = \n\t\t",
629 vis,
630 tolower (var->name[0]),
631 var->name + 1);
632 print_syntax (outfile, var->syn, 2);
633 fprintf (outfile, ";\n");
635 fputc ('\n', outfile);
639 fprintf (outfile, "\t// Token Definitions\n");
640 for (size_t i = 0; i < ntok; ++i) {
641 const struct definition *tok = &tokens[i];
642 fprintf (outfile,
643 "\t%sstatic final TokenRule<TokenType> %c%s = new TokenRule<>(TokenType.%s,\n\t\t",
644 vis,
645 tolower (tok->name[0]),
646 tok->name + 1,
647 tok->name);
648 print_syntax (outfile, tok->syn, 2);
649 fprintf (outfile, "\n\t);\n");
651 fputc ('\n', outfile);
653 fprintf (outfile, "\t%sstatic final Lexer construct() {\n", vis);
654 fprintf (outfile, "\t\tfinal var rules = new java.util.ArrayList<TokenRule<TokenType>>();\n");
655 for (size_t i = 0; i < ntok; ++i) {
656 const struct definition *tok = &tokens[i];
657 fprintf (outfile, "\t\trules.add(%c%s);\n", tolower (tok->name[0]), tok->name + 1);
660 fprintf (outfile, "\t\treturn new Lexer(new TokenRuleSet(rules))");
661 if (eof)
662 fprintf (outfile, "\n\t\t\t.setEndType(TokenType.%s)", eof);
663 if (error)
664 fprintf (outfile, "\n\t\t\t.setErrorType(TokenType.%s)", error);
665 fprintf (outfile, ";\n\t}\n\n");
667 while (fgets (line, sizeof (line), infile) != NULL) {
668 fputc ('\t', outfile);
669 fputs (line, outfile);
672 fputs ("}\n", outfile);
674 return 0;
/* Print the command-line synopsis to stderr; always returns 1 so callers can `return usage ();`. */
static int usage (void)
{
	static const char synopsis[] =
		"usage: lexicgen [-v VISIBILITY] [-p PACKAGE] [-o FILE] file\n";

	fputs (synopsis, stderr);
	return 1;
}
/*
 * Return a heap-allocated string made of the first `len` bytes of
 * `start` followed by `suffix`, or NULL when memory cannot be allocated.
 */
static char *copy_span (const char *start, size_t len, const char *suffix)
{
	const size_t slen = strlen (suffix);
	char *out = malloc (len + slen + 1);

	if (out == NULL)
		return NULL;
	memcpy (out, start, len);
	memcpy (out + len, suffix, slen + 1);
	return out;
}

/*
 * Derive the output file name and Java class name from an input path.
 *
 * The class name is the last path component up to (not including) its
 * first '.', e.g. "dir/Foo.lex" -> "Foo".  The output file name is the
 * path up to that same '.' with ".java" appended, e.g. "dir/Foo.java".
 *
 * outfilename  receives the heap-allocated output name; may be NULL when
 *              the caller does not need it
 * classname    receives the heap-allocated class name
 *
 * Returns 0 on success.  Returns -1, storing nothing, when the base name
 * is empty (e.g. "dir/" or ".lex") or memory cannot be allocated.
 */
static int gen_names (const char *filename, char **outfilename, char **classname)
{
	const char *slash, *base, *dot;
	char *out = NULL, *cls;

	/* Locate the base name without forming a pointer before `filename`. */
	slash = strrchr (filename, '/');
	base = (slash != NULL) ? slash + 1 : filename;

	dot = strchr (base, '.');
	if (dot == NULL)
		dot = base + strlen (base);

	if (dot == base)
		return -1;

	cls = copy_span (base, (size_t)(dot - base), "");
	if (cls == NULL)
		return -1;

	if (outfilename != NULL) {
		out = copy_span (filename, (size_t)(dot - filename), ".java");
		if (out == NULL) {
			free (cls);
			return -1;
		}
		*outfilename = out;
	}

	*classname = cls;
	return 0;
}
708 int main (int argc, char *argv[])
710 int option;
711 char *package = NULL;
712 char *infilename = NULL, *outfilename = NULL, *classname;
713 FILE *infile, *outfile;
714 enum Visibility vis = V_PRIVATE;
715 int ret;
717 #ifdef __OpenBSD__
718 if (pledge ("stdio rpath wpath cpath", NULL) != 0)
719 err (1, "pledge()");
720 #endif
722 while ((option = getopt (argc, argv, "p:v:o:")) != -1) {
723 switch (option) {
724 case 'p':
725 package = optarg;
726 break;
727 case 'v':
728 if (strcmp (optarg, "public") == 0) {
729 vis = V_PUBLIC;
730 } else if (strcmp (optarg, "private") == 0) {
731 vis = V_PRIVATE;
732 } else if (strcmp (optarg, "package") == 0) {
733 vis = V_PACKAGE;
734 } else {
735 errx (1, "Invalid visibility: '%s', expected any of: private, public, package", optarg);
737 break;
738 case 'o':
739 outfilename = optarg;
740 break;
741 default:
742 return usage ();
746 argv += optind;
747 argc -= optind;
749 if (argc != 1)
750 return usage ();
752 infilename = argv[0];
753 if (gen_names (infilename, outfilename ? NULL : &outfilename, &classname) != 0)
754 errx (1, "invalid file name: %s", infilename);
756 infile = fopen (infilename, "r");
757 if (infile == NULL)
758 err (1, "fopen(\"%s\")", infilename);
760 if (strcmp (outfilename, "-") == 0) {
761 outfile = stdout;
762 } else {
763 outfile = fopen (outfilename, "w");
764 if (outfile == NULL)
765 err (1, "fopen(\"%s\")", outfilename);
768 #ifdef __OpenBSD__
769 if (pledge ("stdio", NULL) != 0)
770 err (1, "pledge()");
771 #endif
773 ret = compile (infile, infilename, outfile, visibilities[vis], package, classname);
775 fclose (outfile);
776 fclose (infile);
777 if (ret != 0)
778 remove (outfilename);
779 return ret;