From 4482121fd9e413050419c303cea3e08bbf8ee3a5 Mon Sep 17 00:00:00 2001 From: schwarze Date: Sun, 25 Jun 2017 17:42:37 +0000 Subject: [PATCH] Catch typos in .Sh names; suggested by jmc@. I'm using a very simple, linear time / zero space fuzzy string matching heuristic rather than a full Levenshtein metric, to keep the code both simple and fast. --- usr.bin/mandoc/mandoc.1 | 7 +++- usr.bin/mandoc/mandoc.h | 3 +- usr.bin/mandoc/mdoc_validate.c | 65 ++++++++++++++++++++++++++++++++-- usr.bin/mandoc/read.c | 3 +- 4 files changed, 73 insertions(+), 5 deletions(-) diff --git a/usr.bin/mandoc/mandoc.1 b/usr.bin/mandoc/mandoc.1 index 2df74ea9cd6..931aabfa428 100644 --- a/usr.bin/mandoc/mandoc.1 +++ b/usr.bin/mandoc/mandoc.1 @@ -1,4 +1,4 @@ -.\" $OpenBSD: mandoc.1,v 1.130 2017/06/25 07:23:53 bentley Exp $ +.\" $OpenBSD: mandoc.1,v 1.131 2017/06/25 17:42:37 schwarze Exp $ .\" .\" Copyright (c) 2009, 2010, 2011 Kristaps Dzonsons .\" Copyright (c) 2012, 2014-2017 Ingo Schwarze @@ -857,6 +857,11 @@ A single manual page contains two copies of the RCS identifier for the same operating system. Consider deleting the later instance and moving the first one up to the top of the page. +.It Sy "typo in section name" +.Pq mdoc +Fuzzy string matching revealed that the argument of an +.Ic \&Sh +macro is similar, but not identical to a standard section name. .It Sy "useless macro" .Pq mdoc A diff --git a/usr.bin/mandoc/mandoc.h b/usr.bin/mandoc/mandoc.h index 3ffdb5f9c0c..9515490c827 100644 --- a/usr.bin/mandoc/mandoc.h +++ b/usr.bin/mandoc/mandoc.h @@ -1,4 +1,4 @@ -/* $OpenBSD: mandoc.h,v 1.177 2017/06/24 18:58:09 schwarze Exp $ */ +/* $OpenBSD: mandoc.h,v 1.178 2017/06/25 17:42:37 schwarze Exp $ */ /* * Copyright (c) 2010, 2011, 2014 Kristaps Dzonsons * Copyright (c) 2010-2017 Ingo Schwarze @@ -56,6 +56,7 @@ enum mandocerr { MANDOCERR_DATE_LEGACY, /* legacy man(7) date format: Dd ... */ MANDOCERR_RCS_REP, /* duplicate RCS id: ... */ + MANDOCERR_SEC_TYPO, /* typo in section name: Sh ... 
*/ MANDOCERR_MACRO_USELESS, /* useless macro: macro */ MANDOCERR_BX, /* consider using OS macro: macro */ MANDOCERR_ER_ORDER, /* errnos out of order: Er ... */ diff --git a/usr.bin/mandoc/mdoc_validate.c b/usr.bin/mandoc/mdoc_validate.c index 46f0bae730d..0b45e79303b 100644 --- a/usr.bin/mandoc/mdoc_validate.c +++ b/usr.bin/mandoc/mdoc_validate.c @@ -1,4 +1,4 @@ -/* $OpenBSD: mdoc_validate.c,v 1.257 2017/06/24 18:58:09 schwarze Exp $ */ +/* $OpenBSD: mdoc_validate.c,v 1.258 2017/06/25 17:42:37 schwarze Exp $ */ /* * Copyright (c) 2008-2012 Kristaps Dzonsons * Copyright (c) 2010-2017 Ingo Schwarze @@ -58,6 +58,7 @@ static void check_toptext(struct roff_man *, int, int, const char *); static int child_an(const struct roff_node *); static size_t macro2len(enum roff_tok); static void rewrite_macro2len(struct roff_man *, char **); +static int similar(const char *, const char *); static void post_an(POST_ARGS); static void post_an_norm(POST_ARGS); @@ -2133,11 +2134,54 @@ post_sh_authors(POST_ARGS) mdoc->last->line, mdoc->last->pos, NULL); } +/* + * Return an upper bound for the string distance (allowing + * transpositions). Not a full Levenshtein implementation + * because Levenshtein is quadratic in the string length + * and this function is called for every standard name, + * so the check for each custom name would be cubic. + * The following crude heuristic is linear, resulting + * in quadratic behaviour for checking one custom name, + * which does not cause measurable slowdown. 
+ */ +static int +similar(const char *s1, const char *s2) +{ + const int maxdist = 3; + int dist = 0; + + while (s1[0] != '\0' && s2[0] != '\0') { + if (s1[0] == s2[0]) { + s1++; + s2++; + continue; + } + if (++dist > maxdist) + return INT_MAX; + if (s1[1] == s2[1]) { /* replacement */ + s1++; + s2++; + } else if (s1[0] == s2[1] && s1[1] == s2[0]) { + s1 += 2; /* transposition */ + s2 += 2; + } else if (s1[0] == s2[1]) /* insertion */ + s2++; + else if (s1[1] == s2[0]) /* deletion */ + s1++; + else + return INT_MAX; + } + dist += strlen(s1) + strlen(s2); + return dist > maxdist ? INT_MAX : dist; +} + static void post_sh_head(POST_ARGS) { struct roff_node *nch; const char *goodsec; + const char *const *testsec; + int dist, mindist; enum roff_sec sec; /* @@ -2175,8 +2219,25 @@ post_sh_head(POST_ARGS) /* We don't care about custom sections after this. */ - if (sec == SEC_CUSTOM) + if (sec == SEC_CUSTOM) { + if ((nch = mdoc->last->child) == NULL || + nch->type != ROFFT_TEXT || nch->next != NULL) + return; + goodsec = NULL; + mindist = INT_MAX; + for (testsec = secnames + 1; *testsec != NULL; testsec++) { + dist = similar(nch->string, *testsec); + if (dist < mindist) { + goodsec = *testsec; + mindist = dist; + } + } + if (goodsec != NULL) + mandoc_vmsg(MANDOCERR_SEC_TYPO, mdoc->parse, + nch->line, nch->pos, "Sh %s instead of %s", + nch->string, goodsec); return; + } /* * Check whether our non-custom section is being repeated or is diff --git a/usr.bin/mandoc/read.c b/usr.bin/mandoc/read.c index 61466c5de74..55f24b3fdf5 100644 --- a/usr.bin/mandoc/read.c +++ b/usr.bin/mandoc/read.c @@ -1,4 +1,4 @@ -/* $OpenBSD: read.c,v 1.153 2017/06/24 18:58:09 schwarze Exp $ */ +/* $OpenBSD: read.c,v 1.154 2017/06/25 17:42:37 schwarze Exp $ */ /* * Copyright (c) 2008, 2009, 2010, 2011 Kristaps Dzonsons * Copyright (c) 2010-2017 Ingo Schwarze @@ -94,6 +94,7 @@ static const char * const mandocerrs[MANDOCERR_MAX] = { "legacy man(7) date format", "duplicate RCS id", + "typo in section 
name", "useless macro", "consider using OS macro", "errnos out of order", -- 2.20.1