Make roff_expand() parse left-to-right rather than right-to-left.
author schwarze <schwarze@openbsd.org>
Thu, 19 May 2022 15:17:50 +0000 (15:17 +0000)
committer schwarze <schwarze@openbsd.org>
Thu, 19 May 2022 15:17:50 +0000 (15:17 +0000)
Some escape sequences have side effects on global state, implying
that the order of evaluation matters.  For example, this fixes the
long-standing bug that "\n+x\n+x\n+x" after ".nr x 0 1" used to
print "321"; now it correctly prints "123".

Right-to-left parsing was convenient because it implicitly handled
nested escape sequences.  With correct left-to-right parsing, nesting
now requires an explicit implementation, here solved as follows:
1. Handle nested expanding escape sequences iteratively.
When finding one, expand it, then retry parsing the enclosing escape
sequence from the beginning, which will ultimately succeed as soon
as it no longer contains any nested expanding escape sequences.
2. Handle nested non-expanding escape sequences recursively.
When finding one, the escape sequence parser calls itself to find
the end of the inner sequence, then continues parsing the outer
sequence after that point.

This requires the mandoc_escape() function to operate in two different
modes.  The roff(7) parser uses it in a mode where it generates
diagnostics and may return an expansion request instead of a parse
result.  All other callers, in particular the formatters, use it
in a simpler mode that never generates diagnostics and always returns
a definite parsing result, but that requires all expanding escape
sequences to already have been expanded earlier.  The bulk of the
code is the same for both modes.
Since this required a major rewrite of the function anyway, move
it into its own new file roff_escape.c and out of the file mandoc.c,
which was misnamed in the first place and lacks a clear focus.

As a side benefit, this also fixes a number of assertion failures
that tb@ found with afl(1), for example "\n\\\\*0", "\v\-\\*0",
and "\w\-\\\\\$0*0".

As another side benefit, it also resolves some code duplication
between mandoc_escape() and roff_expand() and centralizes all
handling of escape sequences (except for expansion) in roff_escape.c,
hopefully easing maintenance and feature improvements in the future.

While here, also move end-of-input handling out of the complicated
function roff_expand() and into the simpler function roff_parse_comment(),
making the logic easier to understand.

Since this is a major reorganization of a central component of
mandoc(1), stability of the program might slightly suffer for a few
weeks, but I believe that's not a problem at this point of the
release cycle.  The new code already satisfies the regression suite,
but more tweaking and regression testing to further improve the
handling of various escape sequences will likely follow in the near
future.

usr.bin/mandoc/Makefile
usr.bin/mandoc/mandoc.c
usr.bin/mandoc/mandoc.h
usr.bin/mandoc/roff.c
usr.bin/mandoc/roff_escape.c [new file with mode: 0644]
usr.bin/mandoc/roff_int.h

index 8d6fec5..f53c608 100644 (file)
@@ -1,4 +1,4 @@
-# $OpenBSD: Makefile,v 1.118 2020/03/13 00:31:04 schwarze Exp $
+# $OpenBSD: Makefile,v 1.119 2022/05/19 15:17:50 schwarze Exp $
 
 .include <bsd.own.mk>
 
@@ -8,7 +8,8 @@ LDADD   += -lutil -lz
 
 SRCS=  mandoc_aux.c mandoc_ohash.c mandoc.c mandoc_msg.c mandoc_xr.c \
        arch.c chars.c msec.c preconv.c read.c tag.c
-SRCS+= roff.c roff_validate.c tbl.c tbl_opts.c tbl_layout.c tbl_data.c eqn.c
+SRCS+= roff.c roff_escape.c roff_validate.c
+SRCS+= tbl.c tbl_opts.c tbl_layout.c tbl_data.c eqn.c
 SRCS+= mdoc.c mdoc_argv.c mdoc_macro.c mdoc_state.c mdoc_validate.c \
        att.c st.c
 SRCS+= man_macro.c man.c man_validate.c
index ce710c6..26861a9 100644 (file)
@@ -1,7 +1,8 @@
-/* $OpenBSD: mandoc.c,v 1.88 2022/04/13 13:11:33 schwarze Exp $ */
+/* $OpenBSD: mandoc.c,v 1.89 2022/05/19 15:17:50 schwarze Exp $ */
 /*
- * Copyright (c) 2011-2015, 2017-2022 Ingo Schwarze <schwarze@openbsd.org>
- * Copyright (c) 2008-2011, 2014 Kristaps Dzonsons <kristaps@bsd.lv>
+ * Copyright (c) 2010, 2011, 2015, 2017, 2018, 2019, 2020, 2021
+ *               Ingo Schwarze <schwarze@openbsd.org>
+ * Copyright (c) 2009, 2010 Kristaps Dzonsons <kristaps@bsd.lv>
  *
  * Permission to use, copy, modify, and distribute this software for any
  * purpose with or without fee is hereby granted, provided that the above
  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ * Utility functions to handle end of sentence punctuation
+ * and dates and times, for use by mdoc(7) and man(7) parsers.
+ * Utility functions to handle fonts and numbers,
+ * for use by mandoc(1) parsers and formatters.
  */
 #include <sys/types.h>
 
@@ -89,388 +95,6 @@ mandoc_font(const char *cp, int sz)
        }
 }
 
-enum mandoc_esc
-mandoc_escape(const char **end, const char **start, int *sz)
-{
-       const char      *local_start;
-       int              local_sz, c, i;
-       char             term;
-       enum mandoc_esc  gly;
-
-       /*
-        * When the caller doesn't provide return storage,
-        * use local storage.
-        */
-
-       if (NULL == start)
-               start = &local_start;
-       if (NULL == sz)
-               sz = &local_sz;
-
-       /*
-        * Treat "\E" just like "\";
-        * it only makes a difference in copy mode.
-        */
-
-       while (**end == 'E')
-               ++*end;
-
-       /*
-        * Beyond the backslash, at least one input character
-        * is part of the escape sequence.  With one exception
-        * (see below), that character won't be returned.
-        */
-
-       gly = ESCAPE_ERROR;
-       *start = ++*end;
-       *sz = 0;
-       term = '\0';
-
-       switch ((*start)[-1]) {
-       /*
-        * First the glyphs.  There are several different forms of
-        * these, but each eventually returns a substring of the glyph
-        * name.
-        */
-       case '(':
-               gly = ESCAPE_SPECIAL;
-               *sz = 2;
-               break;
-       case '[':
-               if (**start == ' ') {
-                       ++*end;
-                       return ESCAPE_ERROR;
-               }
-               gly = ESCAPE_SPECIAL;
-               term = ']';
-               break;
-       case 'C':
-               if ('\'' != **start)
-                       return ESCAPE_ERROR;
-               *start = ++*end;
-               gly = ESCAPE_SPECIAL;
-               term = '\'';
-               break;
-
-       /*
-        * Escapes taking no arguments at all.
-        */
-       case '!':
-       case '?':
-               return ESCAPE_UNSUPP;
-       case '%':
-       case '&':
-       case ')':
-       case ',':
-       case '/':
-       case '^':
-       case 'a':
-       case 'd':
-       case 'r':
-       case 't':
-       case 'u':
-       case '{':
-       case '|':
-       case '}':
-               return ESCAPE_IGNORE;
-       case 'c':
-               return ESCAPE_NOSPACE;
-       case 'p':
-               return ESCAPE_BREAK;
-
-       /*
-        * The \z escape is supposed to output the following
-        * character without advancing the cursor position.
-        * Since we are mostly dealing with terminal mode,
-        * let us just skip the next character.
-        */
-       case 'z':
-               return ESCAPE_SKIPCHAR;
-
-       /*
-        * Handle all triggers matching \X(xy, \Xx, and \X[xxxx], where
-        * 'X' is the trigger.  These have opaque sub-strings.
-        */
-       case 'F':
-       case 'f':
-       case 'g':
-       case 'k':
-       case 'M':
-       case 'm':
-       case 'n':
-       case 'O':
-       case 'V':
-       case 'Y':
-       case '*':
-               switch ((*start)[-1]) {
-               case 'f':
-                       gly = ESCAPE_FONT;
-                       break;
-               case '*':
-                       gly = ESCAPE_DEVICE;
-                       break;
-               default:
-                       gly = ESCAPE_IGNORE;
-                       break;
-               }
-               switch (**start) {
-               case '(':
-                       if ((*start)[-1] == 'O')
-                               gly = ESCAPE_ERROR;
-                       *start = ++*end;
-                       *sz = 2;
-                       break;
-               case '[':
-                       if ((*start)[-1] == 'O')
-                               gly = (*start)[1] == '5' ?
-                                   ESCAPE_UNSUPP : ESCAPE_ERROR;
-                       *start = ++*end;
-                       term = ']';
-                       break;
-               default:
-                       if ((*start)[-1] == 'O') {
-                               switch (**start) {
-                               case '0':
-                                       gly = ESCAPE_UNSUPP;
-                                       break;
-                               case '1':
-                               case '2':
-                               case '3':
-                               case '4':
-                                       break;
-                               default:
-                                       gly = ESCAPE_ERROR;
-                                       break;
-                               }
-                       }
-                       *sz = 1;
-                       break;
-               }
-               break;
-
-       /*
-        * These escapes are of the form \X'Y', where 'X' is the trigger
-        * and 'Y' is any string.  These have opaque sub-strings.
-        * The \B and \w escapes are handled in roff.c, roff_res().
-        */
-       case 'A':
-       case 'b':
-       case 'D':
-       case 'R':
-       case 'X':
-       case 'Z':
-               gly = ESCAPE_IGNORE;
-               /* FALLTHROUGH */
-       case 'o':
-               if (**start == '\0')
-                       return ESCAPE_ERROR;
-               if (gly == ESCAPE_ERROR)
-                       gly = ESCAPE_OVERSTRIKE;
-               term = **start;
-               *start = ++*end;
-               break;
-
-       /*
-        * These escapes are of the form \X'N', where 'X' is the trigger
-        * and 'N' resolves to a numerical expression.
-        */
-       case 'h':
-       case 'H':
-       case 'L':
-       case 'l':
-       case 'S':
-       case 'v':
-       case 'x':
-               if (strchr(" %&()*+-./0123456789:<=>", **start)) {
-                       if ('\0' != **start)
-                               ++*end;
-                       return ESCAPE_ERROR;
-               }
-               switch ((*start)[-1]) {
-               case 'h':
-                       gly = ESCAPE_HORIZ;
-                       break;
-               case 'l':
-                       gly = ESCAPE_HLINE;
-                       break;
-               default:
-                       gly = ESCAPE_IGNORE;
-                       break;
-               }
-               term = **start;
-               *start = ++*end;
-               break;
-
-       /*
-        * Special handling for the numbered character escape.
-        * XXX Do any other escapes need similar handling?
-        */
-       case 'N':
-               if ('\0' == **start)
-                       return ESCAPE_ERROR;
-               (*end)++;
-               if (isdigit((unsigned char)**start)) {
-                       *sz = 1;
-                       return ESCAPE_IGNORE;
-               }
-               (*start)++;
-               while (isdigit((unsigned char)**end))
-                       (*end)++;
-               *sz = *end - *start;
-               if ('\0' != **end)
-                       (*end)++;
-               return ESCAPE_NUMBERED;
-
-       /*
-        * Sizes get a special category of their own.
-        */
-       case 's':
-               gly = ESCAPE_IGNORE;
-
-               /* See +/- counts as a sign. */
-               if ('+' == **end || '-' == **end || ASCII_HYPH == **end)
-                       *start = ++*end;
-
-               switch (**end) {
-               case '(':
-                       *start = ++*end;
-                       *sz = 2;
-                       break;
-               case '[':
-                       *start = ++*end;
-                       term = ']';
-                       break;
-               case '\'':
-                       *start = ++*end;
-                       term = '\'';
-                       break;
-               case '3':
-               case '2':
-               case '1':
-                       *sz = (*end)[-1] == 's' &&
-                           isdigit((unsigned char)(*end)[1]) ? 2 : 1;
-                       break;
-               default:
-                       *sz = 1;
-                       break;
-               }
-
-               break;
-
-       /*
-        * Several special characters can be encoded as
-        * one-byte escape sequences without using \[].
-        */
-       case ' ':
-       case '\'':
-       case '-':
-       case '.':
-       case '0':
-       case ':':
-       case '_':
-       case '`':
-       case 'e':
-       case '~':
-               gly = ESCAPE_SPECIAL;
-               /* FALLTHROUGH */
-       default:
-               if (gly == ESCAPE_ERROR)
-                       gly = ESCAPE_UNDEF;
-               *start = --*end;
-               *sz = 1;
-               break;
-       }
-
-       /*
-        * Read up to the terminating character,
-        * paying attention to nested escapes.
-        */
-
-       if ('\0' != term) {
-               while (**end != term) {
-                       switch (**end) {
-                       case '\0':
-                               return ESCAPE_ERROR;
-                       case '\\':
-                               (*end)++;
-                               if (ESCAPE_ERROR ==
-                                   mandoc_escape(end, NULL, NULL))
-                                       return ESCAPE_ERROR;
-                               break;
-                       default:
-                               (*end)++;
-                               break;
-                       }
-               }
-               *sz = (*end)++ - *start;
-
-               /*
-                * The file chars.c only provides one common list
-                * of character names, but \[-] == \- is the only
-                * one of the characters with one-byte names that
-                * allows enclosing the name in brackets.
-                */
-               if (gly == ESCAPE_SPECIAL && *sz == 1 && **start != '-')
-                       return ESCAPE_ERROR;
-       } else {
-               assert(*sz > 0);
-               if ((size_t)*sz > strlen(*start))
-                       return ESCAPE_ERROR;
-               *end += *sz;
-       }
-
-       /* Run post-processors. */
-
-       switch (gly) {
-       case ESCAPE_FONT:
-               gly = mandoc_font(*start, *sz);
-               break;
-       case ESCAPE_SPECIAL:
-               if (**start == 'c') {
-                       if (*sz < 6 || *sz > 7 ||
-                           strncmp(*start, "char", 4) != 0 ||
-                           (int)strspn(*start + 4, "0123456789") + 4 < *sz)
-                               break;
-                       c = 0;
-                       for (i = 4; i < *sz; i++)
-                               c = 10 * c + ((*start)[i] - '0');
-                       if (c < 0x21 || (c > 0x7e && c < 0xa0) || c > 0xff)
-                               break;
-                       *start += 4;
-                       *sz -= 4;
-                       gly = ESCAPE_NUMBERED;
-                       break;
-               }
-
-               /*
-                * Unicode escapes are defined in groff as \[u0000]
-                * to \[u10FFFF], where the contained value must be
-                * a valid Unicode codepoint.  Here, however, only
-                * check the length and range.
-                */
-               if (**start != 'u' || *sz < 5 || *sz > 7)
-                       break;
-               if (*sz == 7 && ((*start)[1] != '1' || (*start)[2] != '0'))
-                       break;
-               if (*sz == 6 && (*start)[1] == '0')
-                       break;
-               if (*sz == 5 && (*start)[1] == 'D' &&
-                   strchr("89ABCDEF", (*start)[2]) != NULL)
-                       break;
-               if ((int)strspn(*start + 1, "0123456789ABCDEFabcdef")
-                   + 1 == *sz)
-                       gly = ESCAPE_UNICODE;
-               break;
-       case ESCAPE_DEVICE:
-               assert(*sz == 2 && (*start)[0] == '.' && (*start)[1] == 'T');
-               break;
-       default:
-               break;
-       }
-
-       return gly;
-}
-
 static int
 a2time(time_t *t, const char *fmt, const char *p)
 {
index 717b334..6a53a29 100644 (file)
@@ -1,4 +1,4 @@
-/* $OpenBSD: mandoc.h,v 1.218 2022/04/28 16:16:46 schwarze Exp $ */
+/* $OpenBSD: mandoc.h,v 1.219 2022/05/19 15:17:50 schwarze Exp $ */
 /*
  * Copyright (c) 2012-2022 Ingo Schwarze <schwarze@openbsd.org>
  * Copyright (c) 2010, 2011, 2014 Kristaps Dzonsons <kristaps@bsd.lv>
@@ -285,11 +285,12 @@ enum      mandocerr {
 };
 
 enum   mandoc_esc {
-       ESCAPE_ERROR = 0, /* bail! unparsable escape */
-       ESCAPE_UNSUPP, /* unsupported escape; ignore it */
-       ESCAPE_IGNORE, /* escape to be ignored */
-       ESCAPE_UNDEF, /* undefined escape; print literal character */
-       ESCAPE_SPECIAL, /* a regular special character */
+       ESCAPE_EXPAND = 0, /* interpolation and iterative call needed */
+       ESCAPE_ERROR, /* non-fatal error: unparsable escape */
+       ESCAPE_UNSUPP, /* unsupported escape: warn and ignore */
+       ESCAPE_IGNORE, /* valid escape to be ignored */
+       ESCAPE_UNDEF, /* undefined escape: print literal character */
+       ESCAPE_SPECIAL, /* special character escape */
        ESCAPE_FONT, /* a generic font mode */
        ESCAPE_FONTBOLD, /* bold font mode */
        ESCAPE_FONTITALIC, /* italic font mode */
index 4a93d34..783eba5 100644 (file)
@@ -1,4 +1,4 @@
-/* $OpenBSD: roff.c,v 1.259 2022/05/01 16:18:59 schwarze Exp $ */
+/* $OpenBSD: roff.c,v 1.260 2022/05/19 15:17:51 schwarze Exp $ */
 /*
  * Copyright (c) 2010-2015, 2017-2022 Ingo Schwarze <schwarze@openbsd.org>
  * Copyright (c) 2008-2012, 2014 Kristaps Dzonsons <kristaps@bsd.lv>
@@ -205,6 +205,8 @@ static      int              roff_evalpar(struct roff *, int,
 static int              roff_evalstrcond(const char *, int *);
 static int              roff_expand(struct roff *, struct buf *,
                                int, int, char);
+static void             roff_expand_patch(struct buf *, int,
+                               const char *, int);
 static void             roff_free1(struct roff *);
 static void             roff_freereg(struct roffreg *);
 static void             roff_freestr(struct roffkv *);
@@ -1231,9 +1233,15 @@ deroff(char **dest, const struct roff_node *n)
 
 /* --- main functions of the roff parser ---------------------------------- */
 
+/*
+ * Save comments preceding the title macro, for example in order to
+ * preserve Copyright and license headers in HTML output,
+ * provide diagnostics about RCS ids and trailing whitespace in comments,
+ * then discard comments including preceding whitespace.
+ * This function also handles input line continuation.
+ */
 static int
-roff_parse_comment(struct roff *r, struct buf *buf, int ln, int pos,
-    char newesc)
+roff_parse_comment(struct roff *r, struct buf *buf, int ln, int pos, char ec)
 {
        struct roff_node *n;    /* used for header comments */
        const char      *start; /* start of the string to process */
@@ -1243,15 +1251,39 @@ roff_parse_comment(struct roff *r, struct buf *buf, int ln, int pos,
        int              rcsid; /* kind of RCS id seen */
 
        for (start = stesc = buf->buf + pos;; stesc++) {
+               /*
+                * XXX Ugly hack: Remove the newline character that
+                * mparse_buf_r() appended to mark the end of input
+                * if it is not preceded by an escape character.
+                */
+               if (stesc[0] == '\n') {
+                       assert(stesc[1] == '\0');
+                       stesc[0] = '\0';
+               }
+
                /* The line ends without continuation or comment. */
                if (stesc[0] == '\0')
                        return ROFF_CONT;
 
                /* Unescaped byte: skip it. */
-               if (stesc[0] != newesc)
+               if (stesc[0] != ec)
                        continue;
 
-               /* Backslash at end of line requests line continuation. */
+               /*
+                * XXX Ugly hack: Do not attempt to append another line
+                * if the function mparse_buf_r() appended a newline
+                * character to indicate the end of input.
+                */
+               if (stesc[1] == '\n') {
+                       assert(stesc[2] == '\0');
+                       stesc[0] = '\0';
+                       return ROFF_CONT;
+               }
+
+               /*
+                * An escape character at the end of an input line
+                * requests line continuation.
+                */
                if (stesc[1] == '\0') {
                        stesc[0] = '\0';
                        return ROFF_IGN | ROFF_APPEND;
@@ -1262,7 +1294,7 @@ roff_parse_comment(struct roff *r, struct buf *buf, int ln, int pos,
                        break;
 
                /* Escaped escape character: skip them both. */
-               if (stesc[1] == newesc)
+               if (stesc[1] == ec)
                        stesc++;
        }
 
@@ -1329,324 +1361,217 @@ roff_parse_comment(struct roff *r, struct buf *buf, int ln, int pos,
  * which typically produce output glyphs or change formatter state.
  */
 static int
-roff_expand(struct roff *r, struct buf *buf, int ln, int pos, char newesc)
+roff_expand(struct roff *r, struct buf *buf, int ln, int pos, char ec)
 {
-       struct mctx     *ctx;   /* current macro call context */
-       char             ubuf[24]; /* buffer to print the number */
-       const char      *start; /* start of the string to process */
-       char            *stesc; /* start of an escape sequence ('\\') */
-       const char      *esct;  /* type of esccape sequence */
-       const char      *stnam; /* start of the name, after "[(*" */
-       const char      *cp;    /* end of the name, e.g. before ']' */
-       const char      *res;   /* the string to be substituted */
-       char            *nbuf;  /* new buffer to copy buf->buf to */
-       size_t           maxl;  /* expected length of the escape name */
-       size_t           naml;  /* actual length of the escape name */
-       size_t           asz;   /* length of the replacement */
-       size_t           rsz;   /* length of the rest of the string */
-       int              inaml; /* length returned from mandoc_escape() */
+       char             ubuf[24];      /* buffer to print a number */
+       struct mctx     *ctx;           /* current macro call context */
+       const char      *res;           /* the string to be pasted */
+       const char      *src;           /* source for copying */
+       char            *dst;           /* destination for copying */
+       int              iesc;          /* index of leading escape char */
+       int              inam;          /* index of the escape name */
+       int              iarg;          /* index beginning the argument */
+       int              iendarg;       /* index right after the argument */
+       int              iend;          /* index right after the sequence */
+       int              deftype;       /* type of definition to paste */
+       int              argi;          /* macro argument index */
+       int              quote_args;    /* true for \\$@, false for \\$* */
+       int              asz;           /* length of the replacement */
+       int              rsz;           /* length of the rest of the string */
+       int              npos;          /* position in numeric expression */
        int              expand_count;  /* to avoid infinite loops */
-       int              npos;  /* position in numeric expression */
-       int              arg_complete; /* argument not interrupted by eol */
-       int              quote_args; /* true for \\$@, false for \\$* */
-       int              deftype; /* type of definition to paste */
-       enum mandocerr   err;   /* for escape sequence problems */
-       char             sign;  /* increment number register */
-       char             term;  /* character terminating the escape */
-
-       start = buf->buf + pos;
-       stesc = strchr(start, '\0') - 1;
-       if (stesc >= start && *stesc == '\n')
-               *stesc-- = '\0';
 
        expand_count = 0;
-       while (stesc >= start) {
-               if (*stesc != newesc) {
+       while (buf->buf[pos] != '\0') {
 
-                       /*
-                        * If we have a non-standard escape character,
-                        * escape literal backslashes because all
-                        * processing in subsequent functions uses
-                        * the standard escaping rules.
-                        */
+               /*
+                * Skip plain ASCII characters.
+                * If we have a non-standard escape character,
+                * escape literal backslashes because all processing in
+                * subsequent functions uses the standard escaping rules.
+                */
 
-                       if (newesc != ASCII_ESC && *stesc == '\\') {
-                               *stesc = '\0';
-                               buf->sz = mandoc_asprintf(&nbuf, "%s\\e%s",
-                                   buf->buf, stesc + 1) + 1;
-                               start = nbuf + pos;
-                               stesc = nbuf + (stesc - buf->buf);
-                               free(buf->buf);
-                               buf->buf = nbuf;
+               if (buf->buf[pos] != ec) {
+                       if (ec != ASCII_ESC && buf->buf[pos] == '\\') {
+                               roff_expand_patch(buf, pos, "\\e", pos + 1);
+                               pos++;
                        }
-
-                       /* Search backwards for the next escape. */
-
-                       stesc--;
+                       pos++;
                        continue;
                }
 
-               /* If it is escaped, skip it. */
-
-               for (cp = stesc - 1; cp >= start; cp--)
-                       if (*cp != r->escape)
-                               break;
-
-               if ((stesc - cp) % 2 == 0) {
-                       while (stesc > cp)
-                               *stesc-- = '\\';
-                       continue;
-               } else if (stesc[1] == '\0') {
-                       *stesc-- = '\0';
-                       continue;
-               } else
-                       *stesc = '\\';
-
-               /* Decide whether to expand or to check only. */
+               /*
+                * Parse escape sequences,
+                * issue diagnostic messages when appropriate,
+                * and skip sequences that do not need expansion.
+                * If we have a non-standard escape character, translate
+                * it to backslashes and translate backslashes to \e.
+                */
 
-               term = '\0';
-               cp = stesc + 1;
-               while (*cp == 'E')
-                       cp++;
-               esct = cp;
-               switch (*esct) {
-               case '*':
-               case '$':
-                       res = NULL;
-                       break;
-               case 'B':
-               case 'w':
-                       term = cp[1];
-                       /* FALLTHROUGH */
-               case 'n':
-                       sign = cp[1];
-                       if (sign == '+' || sign == '-')
-                               cp++;
-                       res = ubuf;
-                       break;
-               default:
-                       err = MANDOCERR_OK;
-                       switch(mandoc_escape(&cp, &stnam, &inaml)) {
-                       case ESCAPE_SPECIAL:
-                               if (mchars_spec2cp(stnam, inaml) >= 0)
-                                       break;
-                               /* FALLTHROUGH */
-                       case ESCAPE_ERROR:
-                               err = MANDOCERR_ESC_BAD;
-                               break;
-                       case ESCAPE_UNDEF:
-                               err = MANDOCERR_ESC_UNDEF;
-                               break;
-                       case ESCAPE_UNSUPP:
-                               err = MANDOCERR_ESC_UNSUPP;
-                               break;
-                       default:
-                               break;
+               if (roff_escape(buf->buf, ln, pos,
+                   &iesc, &iarg, &iendarg, &iend) != ESCAPE_EXPAND) {
+                       while (pos < iend) {
+                               if (buf->buf[pos] == ec) {
+                                       buf->buf[pos] = '\\';
+                                       if (pos + 1 < iend)
+                                               pos++;
+                               } else if (buf->buf[pos] == '\\') {
+                                       roff_expand_patch(buf,
+                                           pos, "\\e", pos + 1);
+                                       pos++;
+                                       iend++;
+                               }
+                               pos++;
                        }
-                       if (err != MANDOCERR_OK)
-                               mandoc_msg(err, ln, (int)(stesc - buf->buf),
-                                   "%.*s", (int)(cp - stesc), stesc);
-                       stesc--;
                        continue;
                }
 
-               if (EXPAND_LIMIT < ++expand_count) {
-                       mandoc_msg(MANDOCERR_ROFFLOOP,
-                           ln, (int)(stesc - buf->buf), NULL);
-                       return ROFF_IGN;
-               }
-
                /*
-                * The third character decides the length
-                * of the name of the string or register.
-                * Save a pointer to the name.
+                * Treat "\E" just like "\";
+                * it only makes a difference in copy mode.
                 */
 
-               if (term == '\0') {
-                       switch (*++cp) {
-                       case '\0':
-                               maxl = 0;
-                               break;
-                       case '(':
-                               cp++;
-                               maxl = 2;
-                               break;
-                       case '[':
-                               cp++;
-                               term = ']';
-                               maxl = 0;
-                               break;
-                       default:
-                               maxl = 1;
-                               break;
-                       }
-               } else {
-                       cp += 2;
-                       maxl = 0;
-               }
-               stnam = cp;
+               inam = iesc + 1;
+               while (buf->buf[inam] == 'E')
+                       inam++;
 
-               /* Advance to the end of the name. */
+               /* Handle expansion. */
 
-               naml = 0;
-               arg_complete = 1;
-               while (maxl == 0 || naml < maxl) {
-                       if (*cp == '\0') {
-                               mandoc_msg(MANDOCERR_ESC_BAD, ln,
-                                   (int)(stesc - buf->buf), "%s", stesc);
-                               arg_complete = 0;
-                               break;
-                       }
-                       if (maxl == 0 && *cp == term) {
-                               cp++;
-                               break;
-                       }
-                       if (*cp++ != '\\' || *esct != 'w') {
-                               naml++;
-                               continue;
-                       }
-                       switch (mandoc_escape(&cp, NULL, NULL)) {
-                       case ESCAPE_SPECIAL:
-                       case ESCAPE_UNICODE:
-                       case ESCAPE_NUMBERED:
-                       case ESCAPE_UNDEF:
-                       case ESCAPE_OVERSTRIKE:
-                               naml++;
+               res = NULL;
+               switch (buf->buf[inam]) {
+               case '*':
+                       if (iendarg == iarg)
                                break;
-                       default:
+                       deftype = ROFFDEF_USER | ROFFDEF_PRE;
+                       if ((res = roff_getstrn(r, buf->buf + iarg,
+                           iendarg - iarg, &deftype)) != NULL)
                                break;
-                       }
-               }
 
-               /*
-                * Retrieve the replacement string; if it is
-                * undefined, resume searching for escapes.
-                */
+                       /*
+                        * If not overridden,
+                        * let \*(.T through to the formatters.
+                        */
 
-               switch (*esct) {
-               case '*':
-                       if (arg_complete) {
-                               deftype = ROFFDEF_USER | ROFFDEF_PRE;
-                               res = roff_getstrn(r, stnam, naml, &deftype);
-
-                               /*
-                                * If not overriden, let \*(.T
-                                * through to the formatters.
-                                */
-
-                               if (res == NULL && naml == 2 &&
-                                   stnam[0] == '.' && stnam[1] == 'T') {
-                                       roff_setstrn(&r->strtab,
-                                           ".T", 2, NULL, 0, 0);
-                                       stesc--;
-                                       continue;
-                               }
+                       if (iendarg - iarg == 2 &&
+                           buf->buf[iarg] == '.' &&
+                           buf->buf[iarg + 1] == 'T') {
+                               roff_setstrn(&r->strtab, ".T", 2, NULL, 0, 0);
+                               pos = iend;
+                               continue;
                        }
+
+                       mandoc_msg(MANDOCERR_STR_UNDEF, ln, iesc,
+                           "%.*s", iendarg - iarg, buf->buf + iarg);
                        break;
+
                case '$':
                        if (r->mstackpos < 0) {
-                               mandoc_msg(MANDOCERR_ARG_UNDEF, ln,
-                                   (int)(stesc - buf->buf), "%.3s", stesc);
+                               mandoc_msg(MANDOCERR_ARG_UNDEF, ln, iesc,
+                                   "%.*s", iend - iesc, buf->buf + iesc);
                                break;
                        }
                        ctx = r->mstack + r->mstackpos;
-                       npos = esct[1] - '1';
-                       if (npos >= 0 && npos <= 8) {
-                               res = npos < ctx->argc ?
-                                   ctx->argv[npos] : "";
+                       argi = buf->buf[iarg] - '1';
+                       if (argi >= 0 && argi <= 8) {
+                               if (argi < ctx->argc)
+                                       res = ctx->argv[argi];
                                break;
                        }
-                       if (esct[1] == '*')
+                       if (buf->buf[iarg] == '*')
                                quote_args = 0;
-                       else if (esct[1] == '@')
+                       else if (buf->buf[iarg] == '@')
                                quote_args = 1;
                        else {
-                               mandoc_msg(MANDOCERR_ARG_NONUM, ln,
-                                   (int)(stesc - buf->buf), "%.3s", stesc);
+                               mandoc_msg(MANDOCERR_ARG_NONUM, ln, iesc,
+                                   "%.*s", iend - iesc, buf->buf + iesc);
                                break;
                        }
                        asz = 0;
-                       for (npos = 0; npos < ctx->argc; npos++) {
-                               if (npos)
+                       for (argi = 0; argi < ctx->argc; argi++) {
+                               if (argi)
                                        asz++;  /* blank */
                                if (quote_args)
                                        asz += 2;  /* quotes */
-                               asz += strlen(ctx->argv[npos]);
+                               asz += strlen(ctx->argv[argi]);
                        }
-                       if (asz != 3) {
-                               rsz = buf->sz - (stesc - buf->buf) - 3;
-                               if (asz < 3)
-                                       memmove(stesc + asz, stesc + 3, rsz);
-                               buf->sz += asz - 3;
-                               nbuf = mandoc_realloc(buf->buf, buf->sz);
-                               start = nbuf + pos;
-                               stesc = nbuf + (stesc - buf->buf);
-                               buf->buf = nbuf;
-                               if (asz > 3)
-                                       memmove(stesc + asz, stesc + 3, rsz);
+                       if (asz != iend - iesc) {
+                               rsz = buf->sz - iend;
+                               if (asz < iend - iesc)
+                                       memmove(buf->buf + iesc + asz,
+                                           buf->buf + iend, rsz);
+                               buf->sz = iesc + asz + rsz;
+                               buf->buf = mandoc_realloc(buf->buf, buf->sz);
+                               if (asz > iend - iesc)
+                                       memmove(buf->buf + iesc + asz,
+                                           buf->buf + iend, rsz);
                        }
-                       for (npos = 0; npos < ctx->argc; npos++) {
-                               if (npos)
-                                       *stesc++ = ' ';
+                       dst = buf->buf + iesc;
+                       for (argi = 0; argi < ctx->argc; argi++) {
+                               if (argi)
+                                       *dst++ = ' ';
                                if (quote_args)
-                                       *stesc++ = '"';
-                               cp = ctx->argv[npos];
-                               while (*cp != '\0')
-                                       *stesc++ = *cp++;
+                                       *dst++ = '"';
+                               src = ctx->argv[argi];
+                               while (*src != '\0')
+                                       *dst++ = *src++;
                                if (quote_args)
-                                       *stesc++ = '"';
+                                       *dst++ = '"';
                        }
                        continue;
                case 'B':
                        npos = 0;
-                       ubuf[0] = arg_complete &&
-                           roff_evalnum(r, ln, stnam, &npos,
-                             NULL, ROFFNUM_SCALE) &&
-                           stnam + npos + 1 == cp ? '1' : '0';
+                       ubuf[0] = iendarg > iarg && iend > iendarg &&
+                           roff_evalnum(r, ln, buf->buf + iarg, &npos,
+                                        NULL, ROFFNUM_SCALE) &&
+                           npos == iendarg - iarg ? '1' : '0';
                        ubuf[1] = '\0';
+                       res = ubuf;
                        break;
                case 'n':
-                       if (arg_complete)
+                       if (iendarg > iarg)
                                (void)snprintf(ubuf, sizeof(ubuf), "%d",
-                                   roff_getregn(r, stnam, naml, sign));
+                                   roff_getregn(r, buf->buf + iarg,
+                                   iendarg - iarg, buf->buf[inam + 1]));
                        else
                                ubuf[0] = '\0';
+                       res = ubuf;
                        break;
                case 'w':
-                       /* use even incomplete args */
-                       (void)snprintf(ubuf, sizeof(ubuf), "%d",
-                           24 * (int)naml);
+                       (void)snprintf(ubuf, sizeof(ubuf),
+                           "%d", (iendarg - iarg) * 24);
+                       res = ubuf;
+                       break;
+               default:
                        break;
                }
-
-               if (res == NULL) {
-                       if (*esct == '*')
-                               mandoc_msg(MANDOCERR_STR_UNDEF,
-                                   ln, (int)(stesc - buf->buf),
-                                   "%.*s", (int)naml, stnam);
+               if (res == NULL)
                        res = "";
-               } else if (buf->sz + strlen(res) > SHRT_MAX) {
-                       mandoc_msg(MANDOCERR_ROFFLOOP,
-                           ln, (int)(stesc - buf->buf), NULL);
+               if (++expand_count > EXPAND_LIMIT ||
+                   buf->sz + strlen(res) > SHRT_MAX) {
+                       mandoc_msg(MANDOCERR_ROFFLOOP, ln, iesc, NULL);
                        return ROFF_IGN;
                }
-
-               /* Replace the escape sequence by the string. */
-
-               *stesc = '\0';
-               buf->sz = mandoc_asprintf(&nbuf, "%s%s%s",
-                   buf->buf, res, cp) + 1;
-
-               /* Prepare for the next replacement. */
-
-               start = nbuf + pos;
-               stesc = nbuf + (stesc - buf->buf) + strlen(res);
-               free(buf->buf);
-               buf->buf = nbuf;
+               roff_expand_patch(buf, iesc, res, iend);
        }
        return ROFF_CONT;
 }
 
+/*
+ * Splice the repl(acement) string into the buffer in place of the
+ * bytes from index start (inclusive) up to index end (exclusive).
+ */
+static void
+roff_expand_patch(struct buf *buf, int start, const char *repl, int end)
+{
+       char    *patched;
+
+       buf->buf[start] = '\0';  /* Terminate the part kept in front. */
+       buf->sz = 1 + mandoc_asprintf(&patched, "%s%s%s",
+           buf->buf, repl, buf->buf + end);
+       free(buf->buf);
+       buf->buf = patched;
+}
+
 /*
  * Parse a quoted or unquoted roff-style request or macro argument.
  * Return a pointer to the parsed argument, which is either the original
diff --git a/usr.bin/mandoc/roff_escape.c b/usr.bin/mandoc/roff_escape.c
new file mode 100644 (file)
index 0000000..4571873
--- /dev/null
@@ -0,0 +1,477 @@
+/* $OpenBSD: roff_escape.c,v 1.1 2022/05/19 15:17:51 schwarze Exp $ */
+/*
+ * Copyright (c) 2011, 2012, 2013, 2014, 2015, 2017, 2018, 2020, 2022
+ *               Ingo Schwarze <schwarze@openbsd.org>
+ * Copyright (c) 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ * Parser for roff(7) escape sequences.
+ * To be used by all mandoc(1) parsers and formatters.
+ */
+#include <assert.h>
+#include <ctype.h>
+#include <limits.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "mandoc.h"
+#include "roff.h"
+#include "roff_int.h"
+
+/*
+ * General-purpose escape sequence interpreter, also used by the
+ * high-level formatters; see mandoc_escape(3).  It never issues
+ * diagnostics and cannot do expansion for the roff(7) parser.
+ */
+enum mandoc_esc
+mandoc_escape(const char **rendarg, const char **rarg, int *rargl)
+{
+        enum mandoc_esc  type;
+        int              argpos, argend, seqend;
+
+        --*rendarg;  /* Step back onto the escape character. */
+        type = roff_escape(*rendarg, 0, 0, NULL, &argpos, &argend, &seqend);
+        assert(type != ESCAPE_EXPAND);
+        if (rarg != NULL)
+              *rarg = *rendarg + argpos;
+        if (rargl != NULL)
+              *rargl = argend - argpos;
+        *rendarg += seqend;
+        return type;
+}
+
+/*
+ * Full-featured escape sequence parser.
+ * Parse the escape sequence starting at index aesc of buf;
+ * ln is passed through to mandoc_msg() when reporting diagnostics.
+ * If it encounters a nested escape sequence that requires expansion
+ * by the parser and re-parsing, the positions of that inner escape
+ * sequence are returned in *resc ... *rend.
+ * Otherwise, *resc is set to aesc and the positions of the escape
+ * sequence starting at aesc are returned.
+ * Each of rarg, rendarg, and rend may be NULL if the caller
+ * does not need the respective position.
+ * Diagnostic messages are generated if and only if resc != NULL,
+ * that is, if and only if called by roff_expand().
+ */
+enum mandoc_esc
+roff_escape(const char *buf, const int ln, const int aesc,
+    int *resc, int *rarg, int *rendarg, int *rend)
+{
+       int              iesc;          /* index of leading escape char */
+       int              iarg;          /* index beginning the argument */
+       int              iendarg;       /* index right after the argument */
+       int              iend;          /* index right after the sequence */
+       int              sesc, sarg, sendarg, send; /* for sub-escape */
+       int              maxl;          /* expected length of the argument */
+       int              argl;          /* actual length of the argument */
+       int              c, i;          /* for \[char...] parsing */
+       enum mandoc_esc  rval;          /* return value */
+       enum mandocerr   err;           /* diagnostic code */
+       char             esc_name;
+       char             term;          /* byte terminating the argument */
+
+       /*
+        * Treat "\E" just like "\";
+        * it only makes a difference in copy mode.
+        */
+
+       iesc = iarg = aesc;
+       do {
+               iarg++;
+       } while (buf[iarg] == 'E');
+
+       /*
+        * Sort the following cases first by syntax category,
+        * then by escape sequence type, and finally by ASCII code.
+        */
+
+       esc_name = buf[iarg];
+       iendarg = iend = ++iarg;
+       maxl = INT_MAX;
+       term = '\0';
+       switch (esc_name) {
+
+       /* Escape sequences taking no arguments at all. */
+
+       case '!':
+       case '?':
+               rval = ESCAPE_UNSUPP;
+               goto out;
+
+       case '%':
+       case '&':
+       case ')':
+       case ',':
+       case '/':
+       case '^':
+       case 'a':
+       case 'd':
+       case 'r':
+       case 't':
+       case 'u':
+       case '{':
+       case '|':
+       case '}':
+               rval = ESCAPE_IGNORE;
+               goto out;
+
+       case '\\':
+       default:
+               iarg--;
+               rval = ESCAPE_UNDEF;
+               goto out;
+
+       case ' ':
+       case '\'':
+       case '-':
+       case '.':
+       case '0':
+       case ':':
+       case '_':
+       case '`':
+       case 'e':
+       case '~':
+               iarg--;
+               argl = 1;
+               rval = ESCAPE_SPECIAL;
+               goto out;
+       case 'p':
+               rval = ESCAPE_BREAK;
+               goto out;
+       case 'c':
+               rval = ESCAPE_NOSPACE;
+               goto out;
+       case 'z':
+               rval = ESCAPE_SKIPCHAR;
+               goto out;
+
+       /* Standard argument format. */
+
+       case '$':
+       case '*':
+       case 'n':
+               rval = ESCAPE_EXPAND;
+               break;
+       case 'F':
+       case 'M':
+       case 'O':
+       case 'V':
+       case 'Y':
+       case 'g':
+       case 'k':
+       case 'm':
+               rval = ESCAPE_IGNORE;
+               break;
+       case '(':
+       case '[':
+               rval = ESCAPE_SPECIAL;
+               iendarg = iend = --iarg;
+               break;
+       case 'f':
+               rval = ESCAPE_FONT;
+               break;
+
+       /* Quoted arguments */
+
+       case 'B':
+       case 'w':
+               rval = ESCAPE_EXPAND;
+               term = '\b';
+               break;
+       case 'A':
+       case 'D':
+       case 'H':
+       case 'L':
+       case 'R':
+       case 'S':
+       case 'X':
+       case 'Z':
+       case 'b':
+       case 'v':
+       case 'x':
+               rval = ESCAPE_IGNORE;
+               term = '\b';
+               break;
+       case 'C':
+               if (buf[iarg] != '\'') {
+                       rval = ESCAPE_ERROR;
+                       goto out;
+               }
+               rval = ESCAPE_SPECIAL;
+               term = '\b';
+               break;
+       case 'N':
+               rval = ESCAPE_NUMBERED;
+               term = '\b';
+               break;
+       case 'h':
+               rval = ESCAPE_HORIZ;
+               term = '\b';
+               break;
+       case 'l':
+               rval = ESCAPE_HLINE;
+               term = '\b';
+               break;
+       case 'o':
+               rval = ESCAPE_OVERSTRIKE;
+               term = '\b';
+               break;
+
+       /* Sizes support both forms, with additional peculiarities. */
+
+       case 's':
+               rval = ESCAPE_IGNORE;
+               if (buf[iarg] == '+' || buf[iarg] == '-'||
+                   buf[iarg] == ASCII_HYPH)
+                       iarg++;
+               switch (buf[iarg]) {
+               case '(':
+                       maxl = 2;
+                       iarg++;
+                       break;
+               case '[':
+                       term = ']';
+                       iarg++;
+                       break;
+               case '\'':
+                       term = '\'';
+                       iarg++;
+                       break;
+               case '1':
+               case '2':
+               case '3':
+                       if (buf[iarg - 1] == 's' &&
+                           isdigit((unsigned char)buf[iarg + 1])) {
+                               maxl = 2;
+                               break;
+                       }
+                       /* FALLTHROUGH */
+               default:
+                       maxl = 1;
+                       break;
+               }
+               iendarg = iend = iarg;
+       }
+
+       /* Decide how to end the argument. */
+
+       if ((term == '\b' || (term == '\0' && maxl == INT_MAX)) &&
+           buf[iarg] == buf[iesc] && roff_escape(buf, ln, iendarg,
+           &sesc, &sarg, &sendarg, &send) == ESCAPE_EXPAND)
+               goto out_sub;
+
+       if (term == '\b') {
+               if ((esc_name == 'N' && isdigit((unsigned char)buf[iarg])) ||
+                   (esc_name == 'h' && strchr(" %&()*+-./0123456789:<=>",
+                    buf[iarg]) != NULL)) {
+                       /* strchr(3) also matches NUL: do not skip it. */
+                       iendarg = iend = iarg + (buf[iarg] != '\0');
+                       rval = ESCAPE_ERROR;
+                       goto out;
+               }
+
+               /*
+                * Consume the opening delimiter, but never step
+                * past the end of the input if it is missing.
+                */
+
+               if ((term = buf[iarg]) != '\0')
+                       iarg++;
+       } else if (term == '\0' && maxl == INT_MAX) {
+               if (esc_name == 'n' && (buf[iarg] == '+' || buf[iarg] == '-'))
+                       iarg++;
+               switch (buf[iarg]) {
+               case '(':
+                       maxl = 2;
+                       iarg++;
+                       break;
+               case '[':
+                       if (buf[++iarg] == ' ') {
+                               iendarg = iend = iarg + 1;
+                               rval = ESCAPE_ERROR;
+                               goto out;
+                       }
+                       term = ']';
+                       break;
+               default:
+                       maxl = 1;
+                       break;
+               }
+       }
+
+       /* Advance to the end of the argument. */
+
+       iendarg = iarg;
+       while (maxl > 0) {
+               if (buf[iendarg] == '\0') {
+                       /* Ignore an incomplete argument except for \w. */
+                       if (esc_name != 'w')
+                               iendarg = iarg;
+                       break;
+               }
+               if (buf[iendarg] == term) {
+                       iend = iendarg + 1;
+                       break;
+               }
+               if (esc_name == 'N' &&
+                   isdigit((unsigned char)buf[iendarg]) == 0) {
+                       iend = iendarg + 1;
+                       break;
+               }
+               if (buf[iendarg] == buf[iesc]) {
+                       if (roff_escape(buf, ln, iendarg,
+                           &sesc, &sarg, &sendarg, &send) == ESCAPE_EXPAND)
+                               goto out_sub;
+                       iendarg = iend = send;
+               } else {
+                       if (maxl != INT_MAX)
+                               maxl--;
+                       iend = ++iendarg;
+               }
+       }
+       if (resc != NULL && ((maxl != INT_MAX && maxl != 0) ||
+           (term != '\0' && buf[iendarg] != term)))
+               mandoc_msg(MANDOCERR_ESC_BAD, ln, iesc, "%s", buf + iesc);
+
+       /* Post-process depending on the content of the argument. */
+
+       argl = iendarg - iarg;
+       switch (esc_name) {
+       case '*':
+               if (resc == NULL && argl == 2 &&
+                   buf[iarg] == '.' && buf[iarg + 1] == 'T')
+                       rval = ESCAPE_DEVICE;
+               break;
+       case 'O':
+               switch (buf[iarg]) {
+               case '0':
+                       rval = ESCAPE_UNSUPP;
+                       break;
+               case '1':
+               case '2':
+               case '3':
+               case '4':
+                       rval = argl == 1 ? ESCAPE_IGNORE : ESCAPE_ERROR;
+                       break;
+               case '5':
+                       rval = buf[iarg - 1] == '[' ? ESCAPE_UNSUPP :
+                           ESCAPE_ERROR;
+                       break;
+               default:
+                       rval = ESCAPE_ERROR;
+                       break;
+               }
+               break;
+       default:
+               break;
+       }
+
+       switch (rval) {
+       case ESCAPE_FONT:
+               rval = mandoc_font(buf + iarg, argl);
+               break;
+
+       case ESCAPE_SPECIAL:
+
+               /*
+                * The file chars.c only provides one common list of
+                * character names, but \[-] == \- is the only one of
+                * the characters with one-byte names that allows
+                * enclosing the name in brackets.
+                */
+
+               if (term != '\0' && argl == 1 && buf[iarg] != '-') {
+                       rval = ESCAPE_ERROR;
+                       break;
+               }
+
+               /* Treat \[char...] as an alias for \N'...'. */
+
+               if (buf[iarg] == 'c') {
+                       if (argl < 6 || argl > 7 ||
+                           strncmp(buf + iarg, "char", 4) != 0 ||
+                           (int)strspn(buf + iarg + 4, "0123456789")
+                            + 4 < argl)
+                               break;
+                       c = 0;  /* Decode only the digits after "char". */
+                       for (i = iarg + 4; i < iendarg; i++)
+                               c = 10 * c + (buf[i] - '0');
+                       if (c < 0x21 || (c > 0x7e && c < 0xa0) || c > 0xff)
+                               break;
+                       iarg += 4;
+                       rval = ESCAPE_NUMBERED;
+                       break;
+               }
+
+               /*
+                * Unicode escapes are defined in groff as \[u0000]
+                * to \[u10FFFF], where the contained value must be
+                * a valid Unicode codepoint.  Here, however, only
+                * check the length and range.
+                */
+
+               if (buf[iarg] != 'u' || argl < 5 || argl > 7)
+                       break;
+               if (argl == 7 &&
+                   (buf[iarg + 1] != '1' || buf[iarg + 2] != '0'))
+                       break;
+               if (argl == 6 && buf[iarg + 1] == '0')
+                       break;
+               if (argl == 5 && buf[iarg + 1] == 'D' &&
+                   strchr("89ABCDEF", buf[iarg + 2]) != NULL)
+                       break;
+               if ((int)strspn(buf + iarg + 1, "0123456789ABCDEFabcdef")
+                   + 1 == argl)
+                       rval = ESCAPE_UNICODE;
+               break;
+       default:
+               break;
+       }
+       goto out;
+
+out_sub:
+       iesc = sesc;
+       iarg = sarg;
+       iendarg = sendarg;
+       iend = send;
+       rval = ESCAPE_EXPAND;
+
+out:
+       if (rarg != NULL)
+               *rarg = iarg;
+       if (rendarg != NULL)
+               *rendarg = iendarg;
+       if (rend != NULL)
+               *rend = iend;
+       if (resc == NULL)
+               return rval;
+
+       /*
+        * Diagnostic messages are only issued when called
+        * from the parser, not when called from the formatters.
+        */
+
+       *resc = iesc;
+       switch (rval) {
+       case ESCAPE_ERROR:
+               err = MANDOCERR_ESC_BAD;
+               break;
+       case ESCAPE_UNSUPP:
+               err = MANDOCERR_ESC_UNSUPP;
+               break;
+       case ESCAPE_UNDEF:
+               if (esc_name == '\\')
+                       return rval;
+               err = MANDOCERR_ESC_UNDEF;
+               break;
+       case ESCAPE_SPECIAL:
+               if (mchars_spec2cp(buf + iarg, argl) >= 0)
+                       return rval;
+               err = MANDOCERR_ESC_BAD;
+               break;
+       default:
+               return rval;
+       }
+       mandoc_msg(err, ln, iesc, "%.*s", iend - iesc, buf + iesc);
+       return rval;
+}
index c2a9246..7bbf4dd 100644 (file)
@@ -1,6 +1,6 @@
-/* $OpenBSD: roff_int.h,v 1.18 2021/10/04 14:18:42 schwarze Exp $      */
+/* $OpenBSD: roff_int.h,v 1.19 2022/05/19 15:17:51 schwarze Exp $      */
 /*
- * Copyright (c) 2013-2015, 2017-2020 Ingo Schwarze <schwarze@openbsd.org>
+ * Copyright (c) 2013-2015, 2017-2022 Ingo Schwarze <schwarze@openbsd.org>
  * Copyright (c) 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv>
  *
  * Permission to use, copy, modify, and distribute this software for any
@@ -82,6 +82,8 @@ struct ohash   *roffhash_alloc(enum roff_tok, enum roff_tok);
 enum roff_tok    roffhash_find(struct ohash *, const char *, size_t);
 void             roffhash_free(struct ohash *);
 
+enum mandoc_esc          roff_escape(const char *, const int, const int,
+                       int *, int *, int *, int *);
 void             roff_state_reset(struct roff_man *);
 void             roff_validate(struct roff_man *);