-NOTE: We are looking for help with a few things:
- https://github.com/libexpat/libexpat/labels/help%20wanted
- If you can help, please get in touch. Thanks!
-
+ __ __ _
+ ___\ \/ /_ __ __ _| |_
+ / _ \\ /| '_ \ / _` | __|
+ | __// \| |_) | (_| | |_
+ \___/_/\_\ .__/ \__,_|\__|
+ |_| XML parser
+
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+!! <blink>Expat is UNDERSTAFFED and WITHOUT FUNDING.</blink> !!
+!! ~~~~~~~~~~~~ !!
+!! The following topics need *additional skilled C developers* to progress !!
+!! in a timely manner or at all (loosely ordered by descending priority): !!
+!! !!
+!! - <blink>fixing a complex non-public security issue</blink>, !!
+!! - teaming up on researching and fixing future security reports and !!
+!! ClusterFuzz findings with few-days-max response times in communication !!
+!! in order to (1) have a sound fix ready before the end of a 90 days !!
+!! grace period and (2) in a sustainable manner, !!
+!! - implementing and auto-testing XML 1.0r5 support !!
+!! (needs discussion before pull requests), !!
+!! - smart ideas on fixing the Autotools CMake files generation issue !!
+!! without breaking CI (needs discussion before pull requests), !!
+!! - the Windows binaries topic (needs requirements engineering first), !!
+!! - pushing migration from `int` to `size_t` further !!
+!! including edge-cases test coverage (needs discussion before anything). !!
+!! !!
+!! For details, please reach out via e-mail to sebastian@pipping.org so we !!
+!! can schedule a voice call on the topic, in English or German. !!
+!! !!
+!! THANK YOU! Sebastian Pipping -- Berlin, 2024-03-09 !!
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+
+Release 2.6.2 Wed March 13 2024
Security fixes:
#839 #842 CVE-2024-28757 -- Prevent billion laughs attacks with
isolated use of external parsers. Please see the commit
message of commit 1d50b80cf31de87750103656f6eb693746854aa8
for details.
+ Bug fixes:
+ #839 #841 Reject direct parameter entity recursion
+ and avoid the related undefined behavior
+
+ Other changes:
+ #847 Autotools: Fix build for DOCBOOK_TO_MAN containing spaces
+ #837 Add missing #821 and #824 to 2.6.1 change log
+ #838 #843 Version info bumped from 10:1:9 (libexpat*.so.1.9.1)
+ to 10:2:9 (libexpat*.so.1.9.2); see https://verbump.de/
+ for what these numbers do
+
+ Special thanks to:
+ Philippe Antoine
+ Tomas Korbar
+ and
+ Clang UndefinedBehaviorSanitizer
+ OSS-Fuzz / ClusterFuzz
+
+Release 2.6.1 Thu February 29 2024
+ Bug fixes:
+ #817 Make tests independent of CPU speed, and thus more robust
+ #828 #836 Expose billion laughs API with XML_DTD defined and
+ XML_GE undefined, regression from 2.6.0
+
+ Other changes:
+ #829 Hide test-only code behind new internal macro
+ #833 Autotools: Reject expat_config.h.in defining SIZEOF_VOID_P
+ #821 #824 Autotools: Fix "make clean" for case:
+ ./configure --without-docbook && make clean all
+ #819 Address compiler warnings
+ #832 #834 Version info bumped from 10:0:9 (libexpat*.so.1.9.0)
+ to 10:1:9 (libexpat*.so.1.9.1); see https://verbump.de/
+ for what these numbers do
+
+ Infrastructure:
+ #818 CI: Adapt to breaking changes in clang-format
+
+ Special thanks to:
+ David Hall
+ Snild Dolkow
+
Release 2.6.0 Tue February 6 2024
Security fixes:
#789 #814 CVE-2023-52425 -- Fix quadratic runtime issues with big tokens
-/* 628e24d4966bedbd4800f6ed128d06d29703765b4bce12d3b7f099f90f842fc9 (2.6.0+)
+/* 2a14271ad4d35e82bde8ba210b4edb7998794bcbae54deab114046a300f9639a (2.6.2+)
__ __ _
___\ \/ /_ __ __ _| |_
/ _ \\ /| '_ \ / _` | __|
Copyright (c) 2022 Jann Horn <jannh@google.com>
Copyright (c) 2022 Sean McBride <sean@rogue-research.com>
Copyright (c) 2023 Owain Davies <owaind@bath.edu>
- Copyright (c) 2023 Sony Corporation / Snild Dolkow <snild@sony.com>
+ Copyright (c) 2023-2024 Sony Corporation / Snild Dolkow <snild@sony.com>
Licensed under the MIT license:
Permission is hereby granted, free of charge, to any person obtaining
#endif
/* Round up n to be a multiple of sz, where sz is a power of 2. */
-#define ROUND_UP(n, sz) (((n) + ((sz)-1)) & ~((sz)-1))
+#define ROUND_UP(n, sz) (((n) + ((sz) - 1)) & ~((sz) - 1))
/* Do safe (NULL-aware) pointer arithmetic */
#define EXPAT_SAFE_PTR_DIFF(p, q) (((p) && (q)) ? ((p) - (q)) : 0)
it odd, since odd numbers are always relative prime to a power of 2.
*/
#define SECOND_HASH(hash, mask, power) \
- ((((hash) & ~(mask)) >> ((power)-1)) & ((mask) >> 2))
+ ((((hash) & ~(mask)) >> ((power) - 1)) & ((mask) >> 2))
#define PROBE_STEP(hash, mask, power) \
((unsigned char)((SECOND_HASH(hash, mask, power)) | 1))
? 0 \
: ((*((pool)->ptr)++ = c), 1))
-XML_Bool g_reparseDeferralEnabledDefault = XML_TRUE; // write ONLY in runtests.c
-unsigned int g_parseAttempts = 0; // used for testing only
+#if ! defined(XML_TESTING)
+const
+#endif
+ XML_Bool g_reparseDeferralEnabledDefault
+ = XML_TRUE; // write ONLY in runtests.c
+#if defined(XML_TESTING)
+unsigned int g_bytesScanned = 0; // used for testing only
+#endif
struct XML_ParserStruct {
/* The first member must be m_userData so that the XML_GetUserData
return XML_ERROR_NONE;
}
}
- g_parseAttempts += 1;
+#if defined(XML_TESTING)
+ g_bytesScanned += (unsigned)have_now;
+#endif
const enum XML_Error ret = parser->m_processor(parser, start, end, endPtr);
if (ret == XML_ERROR_NONE) {
// if we consumed nothing, remember what we had on this parse attempt.
dtd->keepProcessing = dtd->standalone;
goto endEntityValue;
}
- if (entity->open) {
+ if (entity->open || (entity == parser->m_declEntity)) {
if (enc == parser->m_encoding)
parser->m_eventPtr = entityTextPtr;
result = XML_ERROR_RECURSIVE_ENTITY_REF;
}
END_TEST
+/* Check how an external entity parser reacts to a parameter entity whose
+   replacement text refers to the entity itself.  Each case is a DTD snippet
+   fed to a fresh external parser together with the status that parsing is
+   expected to yield; whenever parsing fails, the reported error must be
+   XML_ERROR_RECURSIVE_ENTITY_REF. */
+START_TEST(test_recursive_external_parameter_entity_2) {
+  struct TestCase {
+    const char *doc;
+    enum XML_Status expectedStatus;
+  };
+
+  struct TestCase cases[] = {
+      // Direct self-reference inside the declaration
+      {"<!ENTITY % p1 '%p1;'>", XML_STATUS_ERROR},
+      // Same, followed by a second declaration of the same name
+      {"<!ENTITY % p1 '%p1;'>"
+       "<!ENTITY % p1 'first declaration wins'>",
+       XML_STATUS_ERROR},
+      // Self-referencing text only appears in a later, repeated declaration
+      {"<!ENTITY % p1 'first declaration wins'>"
+       "<!ENTITY % p1 '%p1;'>",
+       XML_STATUS_OK},
+      // "&#37;" is the character reference for '%'.  It must stay escaped
+      // here: with a literal '%' this case would be identical to the first
+      // one while expecting the opposite status.
+      {"<!ENTITY % p1 '&#37;p1;'>", XML_STATUS_OK},
+  };
+
+  for (size_t i = 0; i < sizeof(cases) / sizeof(cases[0]); i++) {
+    const char *const doc = cases[i].doc;
+    const enum XML_Status expectedStatus = cases[i].expectedStatus;
+    set_subtest("%s", doc);
+
+    XML_Parser parser = XML_ParserCreate(NULL);
+    assert_true(parser != NULL);
+
+    // The snippet is parsed by an external entity parser derived from the
+    // main parser, not by the main parser itself.
+    XML_Parser ext_parser = XML_ExternalEntityParserCreate(parser, NULL, NULL);
+    assert_true(ext_parser != NULL);
+
+    const enum XML_Status actualStatus
+        = _XML_Parse_SINGLE_BYTES(ext_parser, doc, (int)strlen(doc), XML_TRUE);
+
+    assert_true(actualStatus == expectedStatus);
+    if (actualStatus != XML_STATUS_OK) {
+      assert_true(XML_GetErrorCode(ext_parser)
+                  == XML_ERROR_RECURSIVE_ENTITY_REF);
+    }
+
+    XML_ParserFree(ext_parser);
+    XML_ParserFree(parser);
+  }
+}
+END_TEST
+
+
/* Test incomplete external entities are faulted */
START_TEST(test_ext_entity_invalid_parse) {
const char *text = "<!DOCTYPE doc [\n"
END_TEST
/* Regression test for quadratic parsing on large tokens */
-START_TEST(test_big_tokens_take_linear_time) {
- const char *const too_slow_failure_message
- = "Compared to the baseline runtime of the first test, this test has a "
- "slowdown of more than <max_slowdown>. "
- "Please keep increasing the value by 1 until it reliably passes the "
- "test on your hardware and open a bug sharing that number with us. "
- "Thanks in advance!";
+START_TEST(test_big_tokens_scale_linearly) {
const struct {
const char *pre;
const char *post;
{"<e><", "/></e>"}, // big elem name, used to be O(N²)
};
const int num_cases = sizeof(text) / sizeof(text[0]);
- // For the test we need a <max_slowdown> value that is:
- // (1) big enough that the test passes reliably (avoiding flaky tests), and
- // (2) small enough that the test actually catches regressions.
- const int max_slowdown = 15;
char aaaaaa[4096];
const int fillsize = (int)sizeof(aaaaaa);
const int fillcount = 100;
+ const unsigned approx_bytes = fillsize * fillcount; // ignore pre/post.
+ const unsigned max_factor = 4;
+ const unsigned max_scanned = max_factor * approx_bytes;
memset(aaaaaa, 'a', fillsize);
if (! g_reparseDeferralEnabledDefault) {
return; // heuristic is disabled; we would get O(n^2) and fail.
}
-#if ! defined(__linux__)
- if (CLOCKS_PER_SEC < 100000) {
- // Skip this test if clock() doesn't have reasonably good resolution.
- // This workaround is primarily targeting Windows and FreeBSD, since
- // XSI requires the value to be 1.000.000 (10x the condition here), and
- // we want to be very sure that at least one platform in CI can catch
- // regressions (through a failing test).
- return;
- }
-#endif
- clock_t baseline = 0;
for (int i = 0; i < num_cases; ++i) {
XML_Parser parser = XML_ParserCreate(NULL);
assert_true(parser != NULL);
enum XML_Status status;
- set_subtest("max_slowdown=%d text=\"%saaaaaa%s\"", max_slowdown,
- text[i].pre, text[i].post);
- const clock_t start = clock();
+ set_subtest("text=\"%saaaaaa%s\"", text[i].pre, text[i].post);
// parse the start text
+ g_bytesScanned = 0;
status = _XML_Parse_SINGLE_BYTES(parser, text[i].pre,
(int)strlen(text[i].pre), XML_FALSE);
if (status != XML_STATUS_OK) {
xml_failure(parser);
}
+
// parse lots of 'a', failing the test early if it takes too long
+ unsigned past_max_count = 0;
for (int f = 0; f < fillcount; ++f) {
status = _XML_Parse_SINGLE_BYTES(parser, aaaaaa, fillsize, XML_FALSE);
if (status != XML_STATUS_OK) {
xml_failure(parser);
}
- // i == 0 means we're still calculating the baseline value
- if (i > 0) {
- const clock_t now = clock();
- const clock_t clocks_so_far = now - start;
- const int slowdown = clocks_so_far / baseline;
- if (slowdown >= max_slowdown) {
- fprintf(
- stderr,
- "fill#%d: clocks_so_far=%d baseline=%d slowdown=%d max_slowdown=%d\n",
- f, (int)clocks_so_far, (int)baseline, slowdown, max_slowdown);
- fail(too_slow_failure_message);
- }
+ if (g_bytesScanned > max_scanned) {
+ // We're not done, and have already passed the limit -- the test will
+ // definitely fail. This block allows us to save time by failing early.
+ const unsigned pushed
+ = (unsigned)strlen(text[i].pre) + (f + 1) * fillsize;
+ fprintf(
+ stderr,
+ "after %d/%d loops: pushed=%u scanned=%u (factor ~%.2f) max_scanned: %u (factor ~%u)\n",
+ f + 1, fillcount, pushed, g_bytesScanned,
+ g_bytesScanned / (double)pushed, max_scanned, max_factor);
+ past_max_count++;
+ // We are failing, but allow a few log prints first. If we don't reach
+ // a count of five, the test will fail after the loop instead.
+ assert_true(past_max_count < 5);
}
}
+
// parse the end text
status = _XML_Parse_SINGLE_BYTES(parser, text[i].post,
(int)strlen(text[i].post), XML_TRUE);
xml_failure(parser);
}
- // how long did it take in total?
- const clock_t end = clock();
- const clock_t taken = end - start;
- if (i == 0) {
- assert_true(taken > 0); // just to make sure we don't div-by-0 later
- baseline = taken;
- }
- const int slowdown = taken / baseline;
- if (slowdown >= max_slowdown) {
- fprintf(stderr, "taken=%d baseline=%d slowdown=%d max_slowdown=%d\n",
- (int)taken, (int)baseline, slowdown, max_slowdown);
- fail(too_slow_failure_message);
+ assert_true(g_bytesScanned > approx_bytes); // or the counter isn't working
+ if (g_bytesScanned > max_scanned) {
+ fprintf(
+ stderr,
+ "after all input: scanned=%u (factor ~%.2f) max_scanned: %u (factor ~%u)\n",
+ g_bytesScanned, g_bytesScanned / (double)approx_bytes, max_scanned,
+ max_factor);
+ fail("scanned too many bytes");
}
XML_ParserFree(parser);
fillsize[2], fillsize[3]);
XML_Parser parser = XML_ParserCreate(NULL);
assert_true(parser != NULL);
- g_parseAttempts = 0;
CharData storage;
CharData_Init(&storage);
XML_SetUserData(parser, &storage);
XML_SetStartElementHandler(parser, start_element_event_handler);
+ g_bytesScanned = 0;
int worstcase_bytes = 0; // sum of (buffered bytes at each XML_Parse call)
- int scanned_bytes = 0; // sum of (buffered bytes at each actual parse)
int offset = 0;
while (*fillsize >= 0) {
assert_true(offset + *fillsize <= document_length); // or test is invalid
- const unsigned attempts_before = g_parseAttempts;
const enum XML_Status status
= XML_Parse(parser, &document[offset], *fillsize, XML_FALSE);
if (status != XML_STATUS_OK) {
fillsize++;
assert_true(offset <= INT_MAX - worstcase_bytes); // avoid overflow
worstcase_bytes += offset; // we might've tried to parse all pending bytes
- if (g_parseAttempts != attempts_before) {
- assert_true(g_parseAttempts == attempts_before + 1); // max 1/XML_Parse
- assert_true(offset <= INT_MAX - scanned_bytes); // avoid overflow
- scanned_bytes += offset; // we *did* try to parse all pending bytes
- }
}
assert_true(storage.count == 1); // the big token should've been parsed
- assert_true(scanned_bytes > 0); // test-the-test: does our counter work?
+ assert_true(g_bytesScanned > 0); // test-the-test: does our counter work?
if (g_reparseDeferralEnabledDefault) {
// heuristic is enabled; some XML_Parse calls may have deferred reparsing
- const int max_bytes_scanned = -*fillsize;
- if (scanned_bytes > max_bytes_scanned) {
+ const unsigned max_bytes_scanned = -*fillsize;
+ if (g_bytesScanned > max_bytes_scanned) {
fprintf(stderr,
- "bytes scanned in parse attempts: actual=%d limit=%d \n",
- scanned_bytes, max_bytes_scanned);
+ "bytes scanned in parse attempts: actual=%u limit=%u \n",
+ g_bytesScanned, max_bytes_scanned);
fail("too many bytes scanned in parse attempts");
}
- assert_true(scanned_bytes <= worstcase_bytes);
- } else {
- // heuristic is disabled; every XML_Parse() will have reparsed
- assert_true(scanned_bytes == worstcase_bytes);
}
+ assert_true(g_bytesScanned <= (unsigned)worstcase_bytes);
XML_ParserFree(parser);
}
tcase_add_test__ifdef_xml_dtd(tc_basic, test_skipped_parameter_entity);
tcase_add_test__ifdef_xml_dtd(tc_basic,
test_recursive_external_parameter_entity);
+ tcase_add_test__ifdef_xml_dtd(tc_basic,
+ test_recursive_external_parameter_entity_2);
tcase_add_test(tc_basic, test_undefined_ext_entity_in_external_dtd);
tcase_add_test(tc_basic, test_suspend_xdecl);
tcase_add_test(tc_basic, test_abort_epilog);
tcase_add_test__ifdef_xml_dtd(tc_basic,
test_pool_integrity_with_unfinished_attr);
tcase_add_test__if_xml_ge(tc_basic, test_nested_entity_suspend);
- tcase_add_test(tc_basic, test_big_tokens_take_linear_time);
+ tcase_add_test(tc_basic, test_big_tokens_scale_linearly);
tcase_add_test(tc_basic, test_set_reparse_deferral);
tcase_add_test(tc_basic, test_reparse_deferral_is_inherited);
tcase_add_test(tc_basic, test_set_reparse_deferral_on_null_parser);