Make roff_expand() parse left-to-right rather than right-to-left.

author schwarze <schwarze@openbsd.org>

Thu, 19 May 2022 15:17:50 +0000 (15:17 +0000)

committer schwarze <schwarze@openbsd.org>

Thu, 19 May 2022 15:17:50 +0000 (15:17 +0000)
author schwarze <schwarze@openbsd.org>
Thu, 19 May 2022 15:17:50 +0000 (15:17 +0000)
committer schwarze <schwarze@openbsd.org>
Thu, 19 May 2022 15:17:50 +0000 (15:17 +0000)
diff --git a/usr.bin/mandoc/Makefile b/usr.bin/mandoc/Makefile

index 8d6fec5..f53c608 100644 (file)
--- a/usr.bin/mandoc/Makefile
+++ b/usr.bin/mandoc/Makefile
@@ -1,4 +1,4 @@
-# $OpenBSD: Makefile,v 1.118 2020/03/13 00:31:04 schwarze Exp $
+# $OpenBSD: Makefile,v 1.119 2022/05/19 15:17:50 schwarze Exp $
  
  .include <bsd.own.mk>
  
@@ -8,7 +8,8 @@ LDADD   += -lutil -lz
  
  SRCS=  mandoc_aux.c mandoc_ohash.c mandoc.c mandoc_msg.c mandoc_xr.c \
         arch.c chars.c msec.c preconv.c read.c tag.c
-SRCS+= roff.c roff_validate.c tbl.c tbl_opts.c tbl_layout.c tbl_data.c eqn.c
+SRCS+= roff.c roff_escape.c roff_validate.c
+SRCS+= tbl.c tbl_opts.c tbl_layout.c tbl_data.c eqn.c
  SRCS+= mdoc.c mdoc_argv.c mdoc_macro.c mdoc_state.c mdoc_validate.c \
         att.c st.c
  SRCS+= man_macro.c man.c man_validate.c
diff --git a/usr.bin/mandoc/mandoc.c b/usr.bin/mandoc/mandoc.c

index ce710c6..26861a9 100644 (file)
--- a/usr.bin/mandoc/mandoc.c
+++ b/usr.bin/mandoc/mandoc.c
@@ -1,7 +1,8 @@
-/* $OpenBSD: mandoc.c,v 1.88 2022/04/13 13:11:33 schwarze Exp $ */
+/* $OpenBSD: mandoc.c,v 1.89 2022/05/19 15:17:50 schwarze Exp $ */
  /*
- * Copyright (c) 2011-2015, 2017-2022 Ingo Schwarze <schwarze@openbsd.org>
- * Copyright (c) 2008-2011, 2014 Kristaps Dzonsons <kristaps@bsd.lv>
+ * Copyright (c) 2010, 2011, 2015, 2017, 2018, 2019, 2020, 2021
+ *               Ingo Schwarze <schwarze@openbsd.org>
+ * Copyright (c) 2009, 2010 Kristaps Dzonsons <kristaps@bsd.lv>
   *
   * Permission to use, copy, modify, and distribute this software for any
   * purpose with or without fee is hereby granted, provided that the above
@@ -14,6 +15,11 @@
   * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
   * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
   * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ * Utility functions to handle end of sentence punctuation
+ * and dates and times, for use by mdoc(7) and man(7) parsers.
+ * Utility functions to handle fonts and numbers,
+ * for use by mandoc(1) parsers and formatters.
   */
  #include <sys/types.h>
  
@@ -89,388 +95,6 @@ mandoc_font(const char *cp, int sz)
         }
  }
  
-enum mandoc_esc
-mandoc_escape(const char **end, const char **start, int *sz)
-{
-       const char      *local_start;
-       int              local_sz, c, i;
-       char             term;
-       enum mandoc_esc  gly;
-
-       /*
-        * When the caller doesn't provide return storage,
-        * use local storage.
-        */
-
-       if (NULL == start)
-               start = &local_start;
-       if (NULL == sz)
-               sz = &local_sz;
-
-       /*
-        * Treat "\E" just like "\";
-        * it only makes a difference in copy mode.
-        */
-
-       while (**end == 'E')
-               ++*end;
-
-       /*
-        * Beyond the backslash, at least one input character
-        * is part of the escape sequence.  With one exception
-        * (see below), that character won't be returned.
-        */
-
-       gly = ESCAPE_ERROR;
-       *start = ++*end;
-       *sz = 0;
-       term = '\0';
-
-       switch ((*start)[-1]) {
-       /*
-        * First the glyphs.  There are several different forms of
-        * these, but each eventually returns a substring of the glyph
-        * name.
-        */
-       case '(':
-               gly = ESCAPE_SPECIAL;
-               *sz = 2;
-               break;
-       case '[':
-               if (**start == ' ') {
-                       ++*end;
-                       return ESCAPE_ERROR;
-               }
-               gly = ESCAPE_SPECIAL;
-               term = ']';
-               break;
-       case 'C':
-               if ('\'' != **start)
-                       return ESCAPE_ERROR;
-               *start = ++*end;
-               gly = ESCAPE_SPECIAL;
-               term = '\'';
-               break;
-
-       /*
-        * Escapes taking no arguments at all.
-        */
-       case '!':
-       case '?':
-               return ESCAPE_UNSUPP;
-       case '%':
-       case '&':
-       case ')':
-       case ',':
-       case '/':
-       case '^':
-       case 'a':
-       case 'd':
-       case 'r':
-       case 't':
-       case 'u':
-       case '{':
-       case '|':
-       case '}':
-               return ESCAPE_IGNORE;
-       case 'c':
-               return ESCAPE_NOSPACE;
-       case 'p':
-               return ESCAPE_BREAK;
-
-       /*
-        * The \z escape is supposed to output the following
-        * character without advancing the cursor position.
-        * Since we are mostly dealing with terminal mode,
-        * let us just skip the next character.
-        */
-       case 'z':
-               return ESCAPE_SKIPCHAR;
-
-       /*
-        * Handle all triggers matching \X(xy, \Xx, and \X[xxxx], where
-        * 'X' is the trigger.  These have opaque sub-strings.
-        */
-       case 'F':
-       case 'f':
-       case 'g':
-       case 'k':
-       case 'M':
-       case 'm':
-       case 'n':
-       case 'O':
-       case 'V':
-       case 'Y':
-       case '*':
-               switch ((*start)[-1]) {
-               case 'f':
-                       gly = ESCAPE_FONT;
-                       break;
-               case '*':
-                       gly = ESCAPE_DEVICE;
-                       break;
-               default:
-                       gly = ESCAPE_IGNORE;
-                       break;
-               }
-               switch (**start) {
-               case '(':
-                       if ((*start)[-1] == 'O')
-                               gly = ESCAPE_ERROR;
-                       *start = ++*end;
-                       *sz = 2;
-                       break;
-               case '[':
-                       if ((*start)[-1] == 'O')
-                               gly = (*start)[1] == '5' ?
-                                   ESCAPE_UNSUPP : ESCAPE_ERROR;
-                       *start = ++*end;
-                       term = ']';
-                       break;
-               default:
-                       if ((*start)[-1] == 'O') {
-                               switch (**start) {
-                               case '0':
-                                       gly = ESCAPE_UNSUPP;
-                                       break;
-                               case '1':
-                               case '2':
-                               case '3':
-                               case '4':
-                                       break;
-                               default:
-                                       gly = ESCAPE_ERROR;
-                                       break;
-                               }
-                       }
-                       *sz = 1;
-                       break;
-               }
-               break;
-
-       /*
-        * These escapes are of the form \X'Y', where 'X' is the trigger
-        * and 'Y' is any string.  These have opaque sub-strings.
-        * The \B and \w escapes are handled in roff.c, roff_res().
-        */
-       case 'A':
-       case 'b':
-       case 'D':
-       case 'R':
-       case 'X':
-       case 'Z':
-               gly = ESCAPE_IGNORE;
-               /* FALLTHROUGH */
-       case 'o':
-               if (**start == '\0')
-                       return ESCAPE_ERROR;
-               if (gly == ESCAPE_ERROR)
-                       gly = ESCAPE_OVERSTRIKE;
-               term = **start;
-               *start = ++*end;
-               break;
-
-       /*
-        * These escapes are of the form \X'N', where 'X' is the trigger
-        * and 'N' resolves to a numerical expression.
-        */
-       case 'h':
-       case 'H':
-       case 'L':
-       case 'l':
-       case 'S':
-       case 'v':
-       case 'x':
-               if (strchr(" %&()*+-./0123456789:<=>", **start)) {
-                       if ('\0' != **start)
-                               ++*end;
-                       return ESCAPE_ERROR;
-               }
-               switch ((*start)[-1]) {
-               case 'h':
-                       gly = ESCAPE_HORIZ;
-                       break;
-               case 'l':
-                       gly = ESCAPE_HLINE;
-                       break;
-               default:
-                       gly = ESCAPE_IGNORE;
-                       break;
-               }
-               term = **start;
-               *start = ++*end;
-               break;
-
-       /*
-        * Special handling for the numbered character escape.
-        * XXX Do any other escapes need similar handling?
-        */
-       case 'N':
-               if ('\0' == **start)
-                       return ESCAPE_ERROR;
-               (*end)++;
-               if (isdigit((unsigned char)**start)) {
-                       *sz = 1;
-                       return ESCAPE_IGNORE;
-               }
-               (*start)++;
-               while (isdigit((unsigned char)**end))
-                       (*end)++;
-               *sz = *end - *start;
-               if ('\0' != **end)
-                       (*end)++;
-               return ESCAPE_NUMBERED;
-
-       /*
-        * Sizes get a special category of their own.
-        */
-       case 's':
-               gly = ESCAPE_IGNORE;
-
-               /* See +/- counts as a sign. */
-               if ('+' == **end || '-' == **end || ASCII_HYPH == **end)
-                       *start = ++*end;
-
-               switch (**end) {
-               case '(':
-                       *start = ++*end;
-                       *sz = 2;
-                       break;
-               case '[':
-                       *start = ++*end;
-                       term = ']';
-                       break;
-               case '\'':
-                       *start = ++*end;
-                       term = '\'';
-                       break;
-               case '3':
-               case '2':
-               case '1':
-                       *sz = (*end)[-1] == 's' &&
-                           isdigit((unsigned char)(*end)[1]) ? 2 : 1;
-                       break;
-               default:
-                       *sz = 1;
-                       break;
-               }
-
-               break;
-
-       /*
-        * Several special characters can be encoded as
-        * one-byte escape sequences without using \[].
-        */
-       case ' ':
-       case '\'':
-       case '-':
-       case '.':
-       case '0':
-       case ':':
-       case '_':
-       case '`':
-       case 'e':
-       case '~':
-               gly = ESCAPE_SPECIAL;
-               /* FALLTHROUGH */
-       default:
-               if (gly == ESCAPE_ERROR)
-                       gly = ESCAPE_UNDEF;
-               *start = --*end;
-               *sz = 1;
-               break;
-       }
-
-       /*
-        * Read up to the terminating character,
-        * paying attention to nested escapes.
-        */
-
-       if ('\0' != term) {
-               while (**end != term) {
-                       switch (**end) {
-                       case '\0':
-                               return ESCAPE_ERROR;
-                       case '\\':
-                               (*end)++;
-                               if (ESCAPE_ERROR ==
-                                   mandoc_escape(end, NULL, NULL))
-                                       return ESCAPE_ERROR;
-                               break;
-                       default:
-                               (*end)++;
-                               break;
-                       }
-               }
-               *sz = (*end)++ - *start;
-
-               /*
-                * The file chars.c only provides one common list
-                * of character names, but \[-] == \- is the only
-                * one of the characters with one-byte names that
-                * allows enclosing the name in brackets.
-                */
-               if (gly == ESCAPE_SPECIAL && *sz == 1 && **start != '-')
-                       return ESCAPE_ERROR;
-       } else {
-               assert(*sz > 0);
-               if ((size_t)*sz > strlen(*start))
-                       return ESCAPE_ERROR;
-               *end += *sz;
-       }
-
-       /* Run post-processors. */
-
-       switch (gly) {
-       case ESCAPE_FONT:
-               gly = mandoc_font(*start, *sz);
-               break;
-       case ESCAPE_SPECIAL:
-               if (**start == 'c') {
-                       if (*sz < 6 || *sz > 7 ||
-                           strncmp(*start, "char", 4) != 0 ||
-                           (int)strspn(*start + 4, "0123456789") + 4 < *sz)
-                               break;
-                       c = 0;
-                       for (i = 4; i < *sz; i++)
-                               c = 10 * c + ((*start)[i] - '0');
-                       if (c < 0x21 || (c > 0x7e && c < 0xa0) || c > 0xff)
-                               break;
-                       *start += 4;
-                       *sz -= 4;
-                       gly = ESCAPE_NUMBERED;
-                       break;
-               }
-
-               /*
-                * Unicode escapes are defined in groff as \[u0000]
-                * to \[u10FFFF], where the contained value must be
-                * a valid Unicode codepoint.  Here, however, only
-                * check the length and range.
-                */
-               if (**start != 'u' || *sz < 5 || *sz > 7)
-                       break;
-               if (*sz == 7 && ((*start)[1] != '1' || (*start)[2] != '0'))
-                       break;
-               if (*sz == 6 && (*start)[1] == '0')
-                       break;
-               if (*sz == 5 && (*start)[1] == 'D' &&
-                   strchr("89ABCDEF", (*start)[2]) != NULL)
-                       break;
-               if ((int)strspn(*start + 1, "0123456789ABCDEFabcdef")
-                   + 1 == *sz)
-                       gly = ESCAPE_UNICODE;
-               break;
-       case ESCAPE_DEVICE:
-               assert(*sz == 2 && (*start)[0] == '.' && (*start)[1] == 'T');
-               break;
-       default:
-               break;
-       }
-
-       return gly;
-}
-
  static int
  a2time(time_t *t, const char *fmt, const char *p)
  {
diff --git a/usr.bin/mandoc/mandoc.h b/usr.bin/mandoc/mandoc.h

index 717b334..6a53a29 100644 (file)
--- a/usr.bin/mandoc/mandoc.h
+++ b/usr.bin/mandoc/mandoc.h
@@ -1,4 +1,4 @@
-/* $OpenBSD: mandoc.h,v 1.218 2022/04/28 16:16:46 schwarze Exp $ */
+/* $OpenBSD: mandoc.h,v 1.219 2022/05/19 15:17:50 schwarze Exp $ */
  /*
   * Copyright (c) 2012-2022 Ingo Schwarze <schwarze@openbsd.org>
   * Copyright (c) 2010, 2011, 2014 Kristaps Dzonsons <kristaps@bsd.lv>
@@ -285,11 +285,12 @@ enum      mandocerr {
  };
  
  enum   mandoc_esc {
-       ESCAPE_ERROR = 0, /* bail! unparsable escape */
-       ESCAPE_UNSUPP, /* unsupported escape; ignore it */
-       ESCAPE_IGNORE, /* escape to be ignored */
-       ESCAPE_UNDEF, /* undefined escape; print literal character */
-       ESCAPE_SPECIAL, /* a regular special character */
+       ESCAPE_EXPAND = 0, /* interpolation and iterative call needed */
+       ESCAPE_ERROR, /* non-fatal error: unparsable escape */
+       ESCAPE_UNSUPP, /* unsupported escape: warn and ignore */
+       ESCAPE_IGNORE, /* valid escape to be ignored */
+       ESCAPE_UNDEF, /* undefined escape: print literal character */
+       ESCAPE_SPECIAL, /* special character escape */
         ESCAPE_FONT, /* a generic font mode */
         ESCAPE_FONTBOLD, /* bold font mode */
         ESCAPE_FONTITALIC, /* italic font mode */
diff --git a/usr.bin/mandoc/roff.c b/usr.bin/mandoc/roff.c

index 4a93d34..783eba5 100644 (file)
--- a/usr.bin/mandoc/roff.c
+++ b/usr.bin/mandoc/roff.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: roff.c,v 1.259 2022/05/01 16:18:59 schwarze Exp $ */
+/* $OpenBSD: roff.c,v 1.260 2022/05/19 15:17:51 schwarze Exp $ */
  /*
   * Copyright (c) 2010-2015, 2017-2022 Ingo Schwarze <schwarze@openbsd.org>
   * Copyright (c) 2008-2012, 2014 Kristaps Dzonsons <kristaps@bsd.lv>
@@ -205,6 +205,8 @@ static      int              roff_evalpar(struct roff *, int,
  static int              roff_evalstrcond(const char *, int *);
  static int              roff_expand(struct roff *, struct buf *,
                                 int, int, char);
+static void             roff_expand_patch(struct buf *, int,
+                               const char *, int);
  static void             roff_free1(struct roff *);
  static void             roff_freereg(struct roffreg *);
  static void             roff_freestr(struct roffkv *);
@@ -1231,9 +1233,15 @@ deroff(char **dest, const struct roff_node *n)
  
  /* --- main functions of the roff parser ---------------------------------- */
  
+/*
+ * Save comments preceding the title macro, for example in order to
+ * preserve Copyright and license headers in HTML output,
+ * provide diagnostics about RCS ids and trailing whitespace in comments,
+ * then discard comments including preceding whitespace.
+ * This function also handles input line continuation.
+ */
  static int
-roff_parse_comment(struct roff *r, struct buf *buf, int ln, int pos,
-    char newesc)
+roff_parse_comment(struct roff *r, struct buf *buf, int ln, int pos, char ec)
  {
         struct roff_node *n;    /* used for header comments */
         const char      *start; /* start of the string to process */
@@ -1243,15 +1251,39 @@ roff_parse_comment(struct roff *r, struct buf *buf, int ln, int pos,
         int              rcsid; /* kind of RCS id seen */
  
         for (start = stesc = buf->buf + pos;; stesc++) {
+               /*
+                * XXX Ugly hack: Remove the newline character that
+                * mparse_buf_r() appended to mark the end of input
+                * if it is not preceded by an escape character.
+                */
+               if (stesc[0] == '\n') {
+                       assert(stesc[1] == '\0');
+                       stesc[0] = '\0';
+               }
+
                 /* The line ends without continuation or comment. */
                 if (stesc[0] == '\0')
                         return ROFF_CONT;
  
                 /* Unescaped byte: skip it. */
-               if (stesc[0] != newesc)
+               if (stesc[0] != ec)
                         continue;
  
-               /* Backslash at end of line requests line continuation. */
+               /*
+                * XXX Ugly hack: Do not attempt to append another line
+                * if the function mparse_buf_r() appended a newline
+                * character to indicate the end of input.
+                */
+               if (stesc[1] == '\n') {
+                       assert(stesc[2] == '\0');
+                       stesc[0] = '\0';
+                       return ROFF_CONT;
+               }
+
+               /*
+                * An escape character at the end of an input line
+                * requests line continuation.
+                */
                 if (stesc[1] == '\0') {
                         stesc[0] = '\0';
                         return ROFF_IGN | ROFF_APPEND;
@@ -1262,7 +1294,7 @@ roff_parse_comment(struct roff *r, struct buf *buf, int ln, int pos,
                         break;
  
                 /* Escaped escape character: skip them both. */
-               if (stesc[1] == newesc)
+               if (stesc[1] == ec)
                         stesc++;
         }
  
@@ -1329,324 +1361,217 @@ roff_parse_comment(struct roff *r, struct buf *buf, int ln, int pos,
   * which typically produce output glyphs or change formatter state.
   */
  static int
-roff_expand(struct roff *r, struct buf *buf, int ln, int pos, char newesc)
+roff_expand(struct roff *r, struct buf *buf, int ln, int pos, char ec)
  {
-       struct mctx     *ctx;   /* current macro call context */
-       char             ubuf[24]; /* buffer to print the number */
-       const char      *start; /* start of the string to process */
-       char            *stesc; /* start of an escape sequence ('\\') */
-       const char      *esct;  /* type of esccape sequence */
-       const char      *stnam; /* start of the name, after "[(*" */
-       const char      *cp;    /* end of the name, e.g. before ']' */
-       const char      *res;   /* the string to be substituted */
-       char            *nbuf;  /* new buffer to copy buf->buf to */
-       size_t           maxl;  /* expected length of the escape name */
-       size_t           naml;  /* actual length of the escape name */
-       size_t           asz;   /* length of the replacement */
-       size_t           rsz;   /* length of the rest of the string */
-       int              inaml; /* length returned from mandoc_escape() */
+       char             ubuf[24];      /* buffer to print a number */
+       struct mctx     *ctx;           /* current macro call context */
+       const char      *res;           /* the string to be pasted */
+       const char      *src;           /* source for copying */
+       char            *dst;           /* destination for copying */
+       int              iesc;          /* index of leading escape char */
+       int              inam;          /* index of the escape name */
+       int              iarg;          /* index beginning the argument */
+       int              iendarg;       /* index right after the argument */
+       int              iend;          /* index right after the sequence */
+       int              deftype;       /* type of definition to paste */
+       int              argi;          /* macro argument index */
+       int              quote_args;    /* true for \\$@, false for \\$* */
+       int              asz;           /* length of the replacement */
+       int              rsz;           /* length of the rest of the string */
+       int              npos;          /* position in numeric expression */
         int              expand_count;  /* to avoid infinite loops */
-       int              npos;  /* position in numeric expression */
-       int              arg_complete; /* argument not interrupted by eol */
-       int              quote_args; /* true for \\$@, false for \\$* */
-       int              deftype; /* type of definition to paste */
-       enum mandocerr   err;   /* for escape sequence problems */
-       char             sign;  /* increment number register */
-       char             term;  /* character terminating the escape */
-
-       start = buf->buf + pos;
-       stesc = strchr(start, '\0') - 1;
-       if (stesc >= start && *stesc == '\n')
-               *stesc-- = '\0';
  
         expand_count = 0;
-       while (stesc >= start) {
-               if (*stesc != newesc) {
+       while (buf->buf[pos] != '\0') {
  
-                       /*
-                        * If we have a non-standard escape character,
-                        * escape literal backslashes because all
-                        * processing in subsequent functions uses
-                        * the standard escaping rules.
-                        */
+               /*
+                * Skip plain ASCII characters.
+                * If we have a non-standard escape character,
+                * escape literal backslashes because all processing in
+                * subsequent functions uses the standard escaping rules.
+                */
  
-                       if (newesc != ASCII_ESC && *stesc == '\\') {
-                               *stesc = '\0';
-                               buf->sz = mandoc_asprintf(&nbuf, "%s\\e%s",
-                                   buf->buf, stesc + 1) + 1;
-                               start = nbuf + pos;
-                               stesc = nbuf + (stesc - buf->buf);
-                               free(buf->buf);
-                               buf->buf = nbuf;
+               if (buf->buf[pos] != ec) {
+                       if (ec != ASCII_ESC && buf->buf[pos] == '\\') {
+                               roff_expand_patch(buf, pos, "\\e", pos + 1);
+                               pos++;
                         }
-
-                       /* Search backwards for the next escape. */
-
-                       stesc--;
+                       pos++;
                         continue;
                 }
  
-               /* If it is escaped, skip it. */
-
-               for (cp = stesc - 1; cp >= start; cp--)
-                       if (*cp != r->escape)
-                               break;
-
-               if ((stesc - cp) % 2 == 0) {
-                       while (stesc > cp)
-                               *stesc-- = '\\';
-                       continue;
-               } else if (stesc[1] == '\0') {
-                       *stesc-- = '\0';
-                       continue;
-               } else
-                       *stesc = '\\';
-
-               /* Decide whether to expand or to check only. */
+               /*
+                * Parse escape sequences,
+                * issue diagnostic messages when appropriate,
+                * and skip sequences that do not need expansion.
+                * If we have a non-standard escape character, translate
+                * it to backslashes and translate backslashes to \e.
+                */
  
-               term = '\0';
-               cp = stesc + 1;
-               while (*cp == 'E')
-                       cp++;
-               esct = cp;
-               switch (*esct) {
-               case '*':
-               case '$':
-                       res = NULL;
-                       break;
-               case 'B':
-               case 'w':
-                       term = cp[1];
-                       /* FALLTHROUGH */
-               case 'n':
-                       sign = cp[1];
-                       if (sign == '+' || sign == '-')
-                               cp++;
-                       res = ubuf;
-                       break;
-               default:
-                       err = MANDOCERR_OK;
-                       switch(mandoc_escape(&cp, &stnam, &inaml)) {
-                       case ESCAPE_SPECIAL:
-                               if (mchars_spec2cp(stnam, inaml) >= 0)
-                                       break;
-                               /* FALLTHROUGH */
-                       case ESCAPE_ERROR:
-                               err = MANDOCERR_ESC_BAD;
-                               break;
-                       case ESCAPE_UNDEF:
-                               err = MANDOCERR_ESC_UNDEF;
-                               break;
-                       case ESCAPE_UNSUPP:
-                               err = MANDOCERR_ESC_UNSUPP;
-                               break;
-                       default:
-                               break;
+               if (roff_escape(buf->buf, ln, pos,
+                   &iesc, &iarg, &iendarg, &iend) != ESCAPE_EXPAND) {
+                       while (pos < iend) {
+                               if (buf->buf[pos] == ec) {
+                                       buf->buf[pos] = '\\';
+                                       if (pos + 1 < iend)
+                                               pos++;
+                               } else if (buf->buf[pos] == '\\') {
+                                       roff_expand_patch(buf,
+                                           pos, "\\e", pos + 1);
+                                       pos++;
+                                       iend++;
+                               }
+                               pos++;
                         }
-                       if (err != MANDOCERR_OK)
-                               mandoc_msg(err, ln, (int)(stesc - buf->buf),
-                                   "%.*s", (int)(cp - stesc), stesc);
-                       stesc--;
                         continue;
                 }
  
-               if (EXPAND_LIMIT < ++expand_count) {
-                       mandoc_msg(MANDOCERR_ROFFLOOP,
-                           ln, (int)(stesc - buf->buf), NULL);
-                       return ROFF_IGN;
-               }
-
                 /*
-                * The third character decides the length
-                * of the name of the string or register.
-                * Save a pointer to the name.
+                * Treat "\E" just like "\";
+                * it only makes a difference in copy mode.
                  */
  
-               if (term == '\0') {
-                       switch (*++cp) {
-                       case '\0':
-                               maxl = 0;
-                               break;
-                       case '(':
-                               cp++;
-                               maxl = 2;
-                               break;
-                       case '[':
-                               cp++;
-                               term = ']';
-                               maxl = 0;
-                               break;
-                       default:
-                               maxl = 1;
-                               break;
-                       }
-               } else {
-                       cp += 2;
-                       maxl = 0;
-               }
-               stnam = cp;
+               inam = iesc + 1;
+               while (buf->buf[inam] == 'E')
+                       inam++;
  
-               /* Advance to the end of the name. */
+               /* Handle expansion. */
  
-               naml = 0;
-               arg_complete = 1;
-               while (maxl == 0 || naml < maxl) {
-                       if (*cp == '\0') {
-                               mandoc_msg(MANDOCERR_ESC_BAD, ln,
-                                   (int)(stesc - buf->buf), "%s", stesc);
-                               arg_complete = 0;
-                               break;
-                       }
-                       if (maxl == 0 && *cp == term) {
-                               cp++;
-                               break;
-                       }
-                       if (*cp++ != '\\' || *esct != 'w') {
-                               naml++;
-                               continue;
-                       }
-                       switch (mandoc_escape(&cp, NULL, NULL)) {
-                       case ESCAPE_SPECIAL:
-                       case ESCAPE_UNICODE:
-                       case ESCAPE_NUMBERED:
-                       case ESCAPE_UNDEF:
-                       case ESCAPE_OVERSTRIKE:
-                               naml++;
+               res = NULL;
+               switch (buf->buf[inam]) {
+               case '*':
+                       if (iendarg == iarg)
                                 break;
-                       default:
+                       deftype = ROFFDEF_USER | ROFFDEF_PRE;
+                       if ((res = roff_getstrn(r, buf->buf + iarg,
+                           iendarg - iarg, &deftype)) != NULL)
                                 break;
-                       }
-               }
  
-               /*
-                * Retrieve the replacement string; if it is
-                * undefined, resume searching for escapes.
-                */
+                       /*
+                        * If not overridden,
+                        * let \*(.T through to the formatters.
+                        */
  
-               switch (*esct) {
-               case '*':
-                       if (arg_complete) {
-                               deftype = ROFFDEF_USER | ROFFDEF_PRE;
-                               res = roff_getstrn(r, stnam, naml, &deftype);
-
-                               /*
-                                * If not overriden, let \*(.T
-                                * through to the formatters.
-                                */
-
-                               if (res == NULL && naml == 2 &&
-                                   stnam[0] == '.' && stnam[1] == 'T') {
-                                       roff_setstrn(&r->strtab,
-                                           ".T", 2, NULL, 0, 0);
-                                       stesc--;
-                                       continue;
-                               }
+                       if (iendarg - iarg == 2 &&
+                           buf->buf[iarg] == '.' &&
+                           buf->buf[iarg + 1] == 'T') {
+                               roff_setstrn(&r->strtab, ".T", 2, NULL, 0, 0);
+                               pos = iend;
+                               continue;
                         }
+
+                       mandoc_msg(MANDOCERR_STR_UNDEF, ln, iesc,
+                           "%.*s", iendarg - iarg, buf->buf + iarg);
                         break;
+
                 case '$':
                         if (r->mstackpos < 0) {
-                               mandoc_msg(MANDOCERR_ARG_UNDEF, ln,
-                                   (int)(stesc - buf->buf), "%.3s", stesc);
+                               mandoc_msg(MANDOCERR_ARG_UNDEF, ln, iesc,
+                                   "%.*s", iend - iesc, buf->buf + iesc);
                                 break;
                         }
                         ctx = r->mstack + r->mstackpos;
-                       npos = esct[1] - '1';
-                       if (npos >= 0 && npos <= 8) {
-                               res = npos < ctx->argc ?
-                                   ctx->argv[npos] : "";
+                       argi = buf->buf[iarg] - '1';
+                       if (argi >= 0 && argi <= 8) {
+                               if (argi < ctx->argc)
+                                       res = ctx->argv[argi];
                                 break;
                         }
-                       if (esct[1] == '*')
+                       if (buf->buf[iarg] == '*')
                                 quote_args = 0;
-                       else if (esct[1] == '@')
+                       else if (buf->buf[iarg] == '@')
                                 quote_args = 1;
                         else {
-                               mandoc_msg(MANDOCERR_ARG_NONUM, ln,
-                                   (int)(stesc - buf->buf), "%.3s", stesc);
+                               mandoc_msg(MANDOCERR_ARG_NONUM, ln, iesc,
+                                   "%.*s", iend - iesc, buf->buf + iesc);
                                 break;
                         }
                         asz = 0;
-                       for (npos = 0; npos < ctx->argc; npos++) {
-                               if (npos)
+                       for (argi = 0; argi < ctx->argc; argi++) {
+                               if (argi)
                                         asz++;  /* blank */
                                 if (quote_args)
                                         asz += 2;  /* quotes */
-                               asz += strlen(ctx->argv[npos]);
+                               asz += strlen(ctx->argv[argi]);
                         }
-                       if (asz != 3) {
-                               rsz = buf->sz - (stesc - buf->buf) - 3;
-                               if (asz < 3)
-                                       memmove(stesc + asz, stesc + 3, rsz);
-                               buf->sz += asz - 3;
-                               nbuf = mandoc_realloc(buf->buf, buf->sz);
-                               start = nbuf + pos;
-                               stesc = nbuf + (stesc - buf->buf);
-                               buf->buf = nbuf;
-                               if (asz > 3)
-                                       memmove(stesc + asz, stesc + 3, rsz);
+                       if (asz != iend - iesc) {
+                               rsz = buf->sz - iend;
+                               if (asz < iend - iesc)
+                                       memmove(buf->buf + iesc + asz,
+                                           buf->buf + iend, rsz);
+                               buf->sz = iesc + asz + rsz;
+                               buf->buf = mandoc_realloc(buf->buf, buf->sz);
+                               if (asz > iend - iesc)
+                                       memmove(buf->buf + iesc + asz,
+                                           buf->buf + iend, rsz);
                         }
-                       for (npos = 0; npos < ctx->argc; npos++) {
-                               if (npos)
-                                       *stesc++ = ' ';
+                       dst = buf->buf + iesc;
+                       for (argi = 0; argi < ctx->argc; argi++) {
+                               if (argi)
+                                       *dst++ = ' ';
                                 if (quote_args)
-                                       *stesc++ = '"';
-                               cp = ctx->argv[npos];
-                               while (*cp != '\0')
-                                       *stesc++ = *cp++;
+                                       *dst++ = '"';
+                               src = ctx->argv[argi];
+                               while (*src != '\0')
+                                       *dst++ = *src++;
                                 if (quote_args)
-                                       *stesc++ = '"';
+                                       *dst++ = '"';
                         }
                         continue;
                 case 'B':
                         npos = 0;
-                       ubuf[0] = arg_complete &&
-                           roff_evalnum(r, ln, stnam, &npos,
-                             NULL, ROFFNUM_SCALE) &&
-                           stnam + npos + 1 == cp ? '1' : '0';
+                       ubuf[0] = iendarg > iarg && iend > iendarg &&
+                           roff_evalnum(r, ln, buf->buf + iarg, &npos,
+                                        NULL, ROFFNUM_SCALE) &&
+                           npos == iendarg - iarg ? '1' : '0';
                         ubuf[1] = '\0';
+                       res = ubuf;
                         break;
                 case 'n':
-                       if (arg_complete)
+                       if (iendarg > iarg)
                                 (void)snprintf(ubuf, sizeof(ubuf), "%d",
-                                   roff_getregn(r, stnam, naml, sign));
+                                   roff_getregn(r, buf->buf + iarg,
+                                   iendarg - iarg, buf->buf[inam + 1]));
                         else
                                 ubuf[0] = '\0';
+                       res = ubuf;
                         break;
                 case 'w':
-                       /* use even incomplete args */
-                       (void)snprintf(ubuf, sizeof(ubuf), "%d",
-                           24 * (int)naml);
+                       (void)snprintf(ubuf, sizeof(ubuf),
+                           "%d", (iendarg - iarg) * 24);
+                       res = ubuf;
+                       break;
+               default:
                         break;
                 }
-
-               if (res == NULL) {
-                       if (*esct == '*')
-                               mandoc_msg(MANDOCERR_STR_UNDEF,
-                                   ln, (int)(stesc - buf->buf),
-                                   "%.*s", (int)naml, stnam);
+               if (res == NULL)
                         res = "";
-               } else if (buf->sz + strlen(res) > SHRT_MAX) {
-                       mandoc_msg(MANDOCERR_ROFFLOOP,
-                           ln, (int)(stesc - buf->buf), NULL);
+               if (++expand_count > EXPAND_LIMIT ||
+                   buf->sz + strlen(res) > SHRT_MAX) {
+                       mandoc_msg(MANDOCERR_ROFFLOOP, ln, iesc, NULL);
                         return ROFF_IGN;
                 }
-
-               /* Replace the escape sequence by the string. */
-
-               *stesc = '\0';
-               buf->sz = mandoc_asprintf(&nbuf, "%s%s%s",
-                   buf->buf, res, cp) + 1;
-
-               /* Prepare for the next replacement. */
-
-               start = nbuf + pos;
-               stesc = nbuf + (stesc - buf->buf) + strlen(res);
-               free(buf->buf);
-               buf->buf = nbuf;
+               roff_expand_patch(buf, iesc, res, iend);
         }
         return ROFF_CONT;
  }
  
+/*
+ * Splice the repl(acement) string into the buffer in place of the
+ * bytes from index start (inclusive) up to index end (exclusive).
+ */
+static void
+roff_expand_patch(struct buf *buf, int start, const char *repl, int end)
+{
+       char    *old, *nb;
+
+       old = buf->buf;
+       old[start] = '\0';      /* so the first %s prints the head only */
+       buf->sz = mandoc_asprintf(&nb, "%s%s%s", old, repl, old + end) + 1;
+       buf->buf = nb;
+       free(old);
+}
+
  /*
   * Parse a quoted or unquoted roff-style request or macro argument.
   * Return a pointer to the parsed argument, which is either the original
diff --git a/usr.bin/mandoc/roff_escape.c b/usr.bin/mandoc/roff_escape.c

new file mode 100644 (file)

index 0000000..4571873
--- /dev/null
+++ b/usr.bin/mandoc/roff_escape.c
@@ -0,0 +1,477 @@
+/* $OpenBSD: roff_escape.c,v 1.1 2022/05/19 15:17:51 schwarze Exp $ */
+/*
+ * Copyright (c) 2011, 2012, 2013, 2014, 2015, 2017, 2018, 2020, 2022
+ *               Ingo Schwarze <schwarze@openbsd.org>
+ * Copyright (c) 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ * Parser for roff(7) escape sequences.
+ * To be used by all mandoc(1) parsers and formatters.
+ */
+#include <assert.h>
+#include <ctype.h>
+#include <limits.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "mandoc.h"
+#include "roff.h"
+#include "roff_int.h"
+
+/*
+ * General-purpose interpreter for escape sequences, suitable for
+ * high-level formatters.  It never issues diagnostics and cannot be
+ * used for expansion in the roff(7) parser; see mandoc_escape(3).
+ */
+enum mandoc_esc
+mandoc_escape(const char **rendarg, const char **rarg, int *rargl)
+{
+       int              argpos, argend, seqend;
+       enum mandoc_esc  type;
+
+       *rendarg -= 1;          /* back up to the escape character */
+       type = roff_escape(*rendarg, 0, 0, NULL, &argpos, &argend, &seqend);
+       assert(type != ESCAPE_EXPAND);
+       if (rarg != NULL)
+               *rarg = *rendarg + argpos;
+       if (rargl != NULL)
+               *rargl = argend - argpos;
+       *rendarg += seqend;
+       return type;
+}
+
+/*
+ * Full-featured escape sequence parser.
+ * Parse the escape sequence starting at index aesc of the
+ * NUL-terminated buf; ln is only used for diagnostics.
+ * If it encounters a nested escape sequence that requires expansion
+ * by the parser and re-parsing, ESCAPE_EXPAND is returned together
+ * with the positions of that inner sequence in *resc ... *rend.
+ * Otherwise, *resc is set to aesc and the positions of the escape
+ * sequence starting at aesc are returned: *rarg ... *rendarg is the
+ * argument and *rend the index right after the whole sequence.
+ * Diagnostic messages are generated if and only if resc != NULL,
+ * that is, if and only if called by roff_expand().
+ */
+enum mandoc_esc
+roff_escape(const char *buf, const int ln, const int aesc,
+    int *resc, int *rarg, int *rendarg, int *rend)
+{
+       int              iesc;          /* index of leading escape char */
+       int              iarg;          /* index beginning the argument */
+       int              iendarg;       /* index right after the argument */
+       int              iend;          /* index right after the sequence */
+       int              sesc, sarg, sendarg, send; /* for sub-escape */
+       int              maxl;          /* expected length of the argument */
+       int              argl;          /* actual length of the argument */
+       int              c, i;          /* for \[char...] parsing */
+       enum mandoc_esc  rval;          /* return value */
+       enum mandocerr   err;           /* diagnostic code */
+       char             esc_name;
+       char             term;          /* byte terminating the argument */
+
+       /* Treat "\E" like "\"; they only differ in copy mode. */
+
+       iesc = iarg = aesc;
+       do {
+               iarg++;
+       } while (buf[iarg] == 'E');
+
+       /*
+        * Sort the following cases first by syntax category,
+        * then by escape sequence type, and finally by ASCII code.
+        */
+
+       esc_name = buf[iarg];
+       iendarg = iend = ++iarg;
+       maxl = INT_MAX;
+       term = '\0';
+       switch (esc_name) {
+
+       /* Escape sequences taking no arguments at all. */
+
+       case '!':
+       case '?':
+               rval = ESCAPE_UNSUPP;
+               goto out;
+
+       case '%':
+       case '&':
+       case ')':
+       case ',':
+       case '/':
+       case '^':
+       case 'a':
+       case 'd':
+       case 'r':
+       case 't':
+       case 'u':
+       case '{':
+       case '|':
+       case '}':
+               rval = ESCAPE_IGNORE;
+               goto out;
+
+       case '\\':
+       default:
+               iarg--;
+               rval = ESCAPE_UNDEF;
+               goto out;
+
+       case ' ':
+       case '\'':
+       case '-':
+       case '.':
+       case '0':
+       case ':':
+       case '_':
+       case '`':
+       case 'e':
+       case '~':
+               iarg--;
+               argl = 1;
+               rval = ESCAPE_SPECIAL;
+               goto out;
+       case 'p':
+               rval = ESCAPE_BREAK;
+               goto out;
+       case 'c':
+               rval = ESCAPE_NOSPACE;
+               goto out;
+       case 'z':
+               rval = ESCAPE_SKIPCHAR;
+               goto out;
+
+       /* Standard argument format. */
+
+       case '$':
+       case '*':
+       case 'n':
+               rval = ESCAPE_EXPAND;
+               break;
+       case 'F':
+       case 'M':
+       case 'O':
+       case 'V':
+       case 'Y':
+       case 'g':
+       case 'k':
+       case 'm':
+               rval = ESCAPE_IGNORE;
+               break;
+       case '(':
+       case '[':
+               rval = ESCAPE_SPECIAL;
+               iendarg = iend = --iarg;
+               break;
+       case 'f':
+               rval = ESCAPE_FONT;
+               break;
+
+       /* Quoted arguments */
+
+       case 'B':
+       case 'w':
+               rval = ESCAPE_EXPAND;
+               term = '\b';
+               break;
+       case 'A':
+       case 'D':
+       case 'H':
+       case 'L':
+       case 'R':
+       case 'S':
+       case 'X':
+       case 'Z':
+       case 'b':
+       case 'v':
+       case 'x':
+               rval = ESCAPE_IGNORE;
+               term = '\b';
+               break;
+       case 'C':
+               if (buf[iarg] != '\'') {
+                       rval = ESCAPE_ERROR;
+                       goto out;
+               }
+               rval = ESCAPE_SPECIAL;
+               term = '\b';
+               break;
+       case 'N':
+               rval = ESCAPE_NUMBERED;
+               term = '\b';
+               break;
+       case 'h':
+               rval = ESCAPE_HORIZ;
+               term = '\b';
+               break;
+       case 'l':
+               rval = ESCAPE_HLINE;
+               term = '\b';
+               break;
+       case 'o':
+               rval = ESCAPE_OVERSTRIKE;
+               term = '\b';
+               break;
+
+       /* Sizes support both forms, with additional peculiarities. */
+
+       case 's':
+               rval = ESCAPE_IGNORE;
+               if (buf[iarg] == '+' || buf[iarg] == '-'||
+                   buf[iarg] == ASCII_HYPH)
+                       iarg++;
+               switch (buf[iarg]) {
+               case '(':
+                       maxl = 2;
+                       iarg++;
+                       break;
+               case '[':
+                       term = ']';
+                       iarg++;
+                       break;
+               case '\'':
+                       term = '\'';
+                       iarg++;
+                       break;
+               case '1':
+               case '2':
+               case '3':
+                       if (buf[iarg - 1] == 's' &&
+                           isdigit((unsigned char)buf[iarg + 1])) {
+                               maxl = 2;
+                               break;
+                       }
+                       /* FALLTHROUGH */
+               default:
+                       maxl = 1;
+                       break;
+               }
+               iendarg = iend = iarg;
+       }
+
+       /* Decide how to end the argument. */
+
+       if ((term == '\b' || (term == '\0' && maxl == INT_MAX)) &&
+           buf[iarg] == buf[iesc] && roff_escape(buf, ln, iendarg,
+           &sesc, &sarg, &sendarg, &send) == ESCAPE_EXPAND)
+               goto out_sub;
+
+       if (term == '\b') {
+               if ((esc_name == 'N' && isdigit((unsigned char)buf[iarg])) ||
+                   (esc_name == 'h' && strchr(" %&()*+-./0123456789:<=>",
+                    buf[iarg]) != NULL)) {
+                       /* strchr(3) also matches NUL: stay on it. */
+                       iendarg = iend = iarg + (buf[iarg] != '\0');
+                       rval = ESCAPE_ERROR;
+                       goto out;
+               }
+               /* Never step beyond the end of the input. */
+               if ((term = buf[iarg]) != '\0')
+                       iarg++;
+       } else if (term == '\0' && maxl == INT_MAX) {
+               if (esc_name == 'n' && (buf[iarg] == '+' || buf[iarg] == '-'))
+                       iarg++;
+               switch (buf[iarg]) {
+               case '(':
+                       maxl = 2;
+                       iarg++;
+                       break;
+               case '[':
+                       if (buf[++iarg] == ' ') {
+                               iendarg = iend = iarg + 1;
+                               rval = ESCAPE_ERROR;
+                               goto out;
+                       }
+                       term = ']';
+                       break;
+               default:
+                       maxl = 1;
+                       break;
+               }
+       }
+
+       /* Advance to the end of the argument. */
+
+       iendarg = iarg;
+       while (maxl > 0) {
+               if (buf[iendarg] == '\0') {
+                       /* Ignore an incomplete argument except for \w. */
+                       if (esc_name != 'w')
+                               iendarg = iarg;
+                       break;
+               }
+               if (buf[iendarg] == term) {
+                       iend = iendarg + 1;
+                       break;
+               }
+               if (esc_name == 'N' &&
+                   isdigit((unsigned char)buf[iendarg]) == 0) {
+                       iend = iendarg + 1;
+                       break;
+               }
+               if (buf[iendarg] == buf[iesc]) {
+                       if (roff_escape(buf, ln, iendarg,
+                           &sesc, &sarg, &sendarg, &send) == ESCAPE_EXPAND)
+                               goto out_sub;
+                       iendarg = iend = send;
+               } else {
+                       if (maxl != INT_MAX)
+                               maxl--;
+                       iend = ++iendarg;
+               }
+       }
+       if (resc != NULL && ((maxl != INT_MAX && maxl != 0) ||
+           (term != '\0' && buf[iendarg] != term)))
+               mandoc_msg(MANDOCERR_ESC_BAD, ln, iesc, "%s", buf + iesc);
+
+       /* Post-process depending on the content of the argument. */
+
+       argl = iendarg - iarg;
+       switch (esc_name) {
+       case '*':
+               if (resc == NULL && argl == 2 &&
+                   buf[iarg] == '.' && buf[iarg + 1] == 'T')
+                       rval = ESCAPE_DEVICE;
+               break;
+       case 'O':
+               switch (buf[iarg]) {
+               case '0':
+                       rval = ESCAPE_UNSUPP;
+                       break;
+               case '1':
+               case '2':
+               case '3':
+               case '4':
+                       rval = argl == 1 ? ESCAPE_IGNORE : ESCAPE_ERROR;
+                       break;
+               case '5':
+                       rval = buf[iarg - 1] == '[' ? ESCAPE_UNSUPP :
+                           ESCAPE_ERROR;
+                       break;
+               default:
+                       rval = ESCAPE_ERROR;
+                       break;
+               }
+               break;
+       default:
+               break;
+       }
+
+       switch (rval) {
+       case ESCAPE_FONT:
+               rval = mandoc_font(buf + iarg, argl);
+               break;
+
+       case ESCAPE_SPECIAL:
+
+               /*
+                * The file chars.c only provides one common list of
+                * character names, but \[-] == \- is the only one of
+                * the characters with one-byte names that allows
+                * enclosing the name in brackets.
+                */
+
+               if (term != '\0' && argl == 1 && buf[iarg] != '-') {
+                       rval = ESCAPE_ERROR;
+                       break;
+               }
+
+               /* Treat \[char...] as an alias for \N'...'. */
+               /* The decimal code follows the four-byte "char" prefix. */
+               if (buf[iarg] == 'c') {
+                       if (argl < 6 || argl > 7 ||
+                           strncmp(buf + iarg, "char", 4) != 0 ||
+                           (int)strspn(buf + iarg + 4, "0123456789")
+                            + 4 < argl)
+                               break;
+                       c = 0;
+                       for (i = iarg + 4; i < iendarg; i++)
+                               c = 10 * c + (buf[i] - '0');
+                       if (c < 0x21 || (c > 0x7e && c < 0xa0) || c > 0xff)
+                               break;
+                       iarg += 4;
+                       rval = ESCAPE_NUMBERED;
+                       break;
+               }
+
+               /*
+                * Unicode escapes are defined in groff as \[u0000]
+                * to \[u10FFFF], where the contained value must be
+                * a valid Unicode codepoint.  Here, however, only
+                * check the length and range.
+                */
+
+               if (buf[iarg] != 'u' || argl < 5 || argl > 7)
+                       break;
+               if (argl == 7 &&
+                   (buf[iarg + 1] != '1' || buf[iarg + 2] != '0'))
+                       break;
+               if (argl == 6 && buf[iarg + 1] == '0')
+                       break;
+               if (argl == 5 && buf[iarg + 1] == 'D' &&
+                   strchr("89ABCDEF", buf[iarg + 2]) != NULL)
+                       break;
+               if ((int)strspn(buf + iarg + 1, "0123456789ABCDEFabcdef")
+                   + 1 == argl)
+                       rval = ESCAPE_UNICODE;
+               break;
+       default:
+               break;
+       }
+       goto out;
+
+out_sub:
+       iesc = sesc;
+       iarg = sarg;
+       iendarg = sendarg;
+       iend = send;
+       rval = ESCAPE_EXPAND;
+
+out:
+       if (rarg != NULL)
+               *rarg = iarg;
+       if (rendarg != NULL)
+               *rendarg = iendarg;
+       if (rend != NULL)
+               *rend = iend;
+       if (resc == NULL)
+               return rval;
+
+       /* Diagnostics are for the parser only, not for the formatters. */
+
+       *resc = iesc;
+       switch (rval) {
+       case ESCAPE_ERROR:
+               err = MANDOCERR_ESC_BAD;
+               break;
+       case ESCAPE_UNSUPP:
+               err = MANDOCERR_ESC_UNSUPP;
+               break;
+       case ESCAPE_UNDEF:
+               if (esc_name == '\\')
+                       return rval;
+               err = MANDOCERR_ESC_UNDEF;
+               break;
+       case ESCAPE_SPECIAL:
+               if (mchars_spec2cp(buf + iarg, argl) >= 0)
+                       return rval;
+               err = MANDOCERR_ESC_BAD;
+               break;
+       default:
+               return rval;
+       }
+       mandoc_msg(err, ln, iesc, "%.*s", iend - iesc, buf + iesc);
+       return rval;
+}
diff --git a/usr.bin/mandoc/roff_int.h b/usr.bin/mandoc/roff_int.h

index c2a9246..7bbf4dd 100644 (file)
--- a/usr.bin/mandoc/roff_int.h
+++ b/usr.bin/mandoc/roff_int.h
@@ -1,6 +1,6 @@
-/* $OpenBSD: roff_int.h,v 1.18 2021/10/04 14:18:42 schwarze Exp $      */
+/* $OpenBSD: roff_int.h,v 1.19 2022/05/19 15:17:51 schwarze Exp $      */
  /*
- * Copyright (c) 2013-2015, 2017-2020 Ingo Schwarze <schwarze@openbsd.org>
+ * Copyright (c) 2013-2015, 2017-2022 Ingo Schwarze <schwarze@openbsd.org>
   * Copyright (c) 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv>
   *
   * Permission to use, copy, modify, and distribute this software for any
@@ -82,6 +82,8 @@ struct ohash   *roffhash_alloc(enum roff_tok, enum roff_tok);
  enum roff_tok    roffhash_find(struct ohash *, const char *, size_t);
  void             roffhash_free(struct ohash *);
  
+enum mandoc_esc          roff_escape(const char *, const int, const int,
+                       int *, int *, int *, int *);
  void             roff_state_reset(struct roff_man *);
  void             roff_validate(struct roff_man *);
author	schwarze <schwarze@openbsd.org>
	Thu, 19 May 2022 15:17:50 +0000 (15:17 +0000)
committer	schwarze <schwarze@openbsd.org>
	Thu, 19 May 2022 15:17:50 +0000 (15:17 +0000)
usr.bin/mandoc/Makefile		patch \| blob \| history
usr.bin/mandoc/mandoc.c		patch \| blob \| history
usr.bin/mandoc/mandoc.h		patch \| blob \| history
usr.bin/mandoc/roff.c		patch \| blob \| history
usr.bin/mandoc/roff_escape.c	[new file with mode: 0644]	patch \| blob
usr.bin/mandoc/roff_int.h		patch \| blob \| history