commit 678a2f7efcaaa977886e055613f2332615aef82c Author: Tomas Korbar Date: Tue Feb 13 13:52:28 2024 +0100 Fix CVE-2023-52425 diff --git a/expat/Makefile.am b/expat/Makefile.am index 37ae373..cd0117f 100644 --- a/expat/Makefile.am +++ b/expat/Makefile.am @@ -131,6 +131,11 @@ buildlib: run-benchmark: $(MAKE) -C tests/benchmark ./run.sh tests/benchmark/benchmark@EXEEXT@ -n $(top_srcdir)/../testdata/largefiles/recset.xml 65535 3 + ./run.sh tests/benchmark/benchmark@EXEEXT@ -n $(top_srcdir)/../testdata/largefiles/aaaaaa_attr.xml 4096 3 + ./run.sh tests/benchmark/benchmark@EXEEXT@ -n $(top_srcdir)/../testdata/largefiles/aaaaaa_cdata.xml 4096 3 + ./run.sh tests/benchmark/benchmark@EXEEXT@ -n $(top_srcdir)/../testdata/largefiles/aaaaaa_comment.xml 4096 3 + ./run.sh tests/benchmark/benchmark@EXEEXT@ -n $(top_srcdir)/../testdata/largefiles/aaaaaa_tag.xml 4096 3 + ./run.sh tests/benchmark/benchmark@EXEEXT@ -n $(top_srcdir)/../testdata/largefiles/aaaaaa_text.xml 4096 3 .PHONY: download-xmlts-zip download-xmlts-zip: diff --git a/expat/doc/reference.html b/expat/doc/reference.html index 8b0d47d..a10f3cb 100644 --- a/expat/doc/reference.html +++ b/expat/doc/reference.html @@ -151,10 +151,11 @@ interface.

  • - Billion Laughs Attack Protection + Attack Protection
  • Miscellaneous Functions @@ -2096,11 +2097,7 @@ parse position may be before the beginning of the buffer.

    return NULL.

    -

    Billion Laughs Attack Protection

    - -

    The functions in this section configure the built-in - protection against various forms of - billion laughs attacks.

    +

    Attack Protection

    XML_SetBillionLaughsAttackProtectionMaximumAmplification

    @@ -2188,6 +2185,27 @@ XML_SetBillionLaughsAttackProtectionActivationThreshold(XML_Parser p,
       

    +

    XML_SetReparseDeferralEnabled

    +
    +/* Added in Expat 2.6.0. */
    +XML_Bool XMLCALL
    +XML_SetReparseDeferralEnabled(XML_Parser parser, XML_Bool enabled);
    +
    +
    +

    + A large token may require many parse calls before enough data is available for Expat to parse it in full. + If Expat retried parsing the token on every parse call, parsing could take quadratic time. + To avoid this, Expat only retries once a significant amount of new data is available. + This function allows disabling this behavior. +

    +

    + The enabled argument should be XML_TRUE or XML_FALSE. +

    +

    + Returns XML_TRUE on success, and XML_FALSE on error. +

    +
    +

    Miscellaneous functions

    The functions in this section either obtain state information from diff --git a/expat/doc/xmlwf.xml b/expat/doc/xmlwf.xml index 9603abf..3d35393 100644 --- a/expat/doc/xmlwf.xml +++ b/expat/doc/xmlwf.xml @@ -313,6 +313,16 @@ supports both. + + + + + Disable reparse deferral, and allow quadratic parse runtime + on large tokens (default: reparse deferral enabled). + + + + diff --git a/expat/lib/expat.h b/expat/lib/expat.h index 1c83563..842dd70 100644 --- a/expat/lib/expat.h +++ b/expat/lib/expat.h @@ -16,6 +16,7 @@ Copyright (c) 2016 Thomas Beutlich Copyright (c) 2017 Rhodri James Copyright (c) 2022 Thijs Schreijer + Copyright (c) 2023 Sony Corporation / Snild Dolkow Licensed under the MIT license: Permission is hereby granted, free of charge, to any person obtaining @@ -1050,6 +1051,10 @@ XML_SetBillionLaughsAttackProtectionActivationThreshold( XML_Parser parser, unsigned long long activationThresholdBytes); #endif +/* Added in Expat 2.6.0. */ +XMLPARSEAPI(XML_Bool) +XML_SetReparseDeferralEnabled(XML_Parser parser, XML_Bool enabled); + /* Expat follows the semantic versioning convention. See http://semver.org. 
*/ diff --git a/expat/lib/internal.h b/expat/lib/internal.h index e09f533..e2709c8 100644 --- a/expat/lib/internal.h +++ b/expat/lib/internal.h @@ -31,6 +31,7 @@ Copyright (c) 2016-2022 Sebastian Pipping Copyright (c) 2018 Yury Gribov Copyright (c) 2019 David Loffredo + Copyright (c) 2023 Sony Corporation / Snild Dolkow Licensed under the MIT license: Permission is hereby granted, free of charge, to any person obtaining @@ -160,6 +161,9 @@ unsigned long long testingAccountingGetCountBytesIndirect(XML_Parser parser); const char *unsignedCharToPrintable(unsigned char c); #endif +extern XML_Bool g_reparseDeferralEnabledDefault; // written ONLY in runtests.c +extern unsigned int g_parseAttempts; // used for testing only + #ifdef __cplusplus } #endif diff --git a/expat/lib/libexpat.def.cmake b/expat/lib/libexpat.def.cmake index cf434a2..3ff4d55 100644 --- a/expat/lib/libexpat.def.cmake +++ b/expat/lib/libexpat.def.cmake @@ -77,3 +77,4 @@ EXPORTS ; added with version 2.4.0 @_EXPAT_COMMENT_DTD@ XML_SetBillionLaughsAttackProtectionActivationThreshold @69 @_EXPAT_COMMENT_DTD@ XML_SetBillionLaughsAttackProtectionMaximumAmplification @70 +XML_SetReparseDeferralEnabled @71 diff --git a/expat/lib/xmlparse.c b/expat/lib/xmlparse.c index b6c2eca..2ae64e9 100644 --- a/expat/lib/xmlparse.c +++ b/expat/lib/xmlparse.c @@ -73,6 +73,7 @@ # endif #endif +#include #include #include /* memset(), memcpy() */ #include @@ -196,6 +197,8 @@ typedef char ICHAR; /* Do safe (NULL-aware) pointer arithmetic */ #define EXPAT_SAFE_PTR_DIFF(p, q) (((p) && (q)) ? ((p) - (q)) : 0) +#define EXPAT_MIN(a, b) (((a) < (b)) ? (a) : (b)) + #include "internal.h" #include "xmltok.h" #include "xmlrole.h" @@ -602,6 +605,9 @@ static unsigned long getDebugLevel(const char *variableName, ? 
0 \ : ((*((pool)->ptr)++ = c), 1)) +XML_Bool g_reparseDeferralEnabledDefault = XML_TRUE; // write ONLY in runtests.c +unsigned int g_parseAttempts = 0; // used for testing only + struct XML_ParserStruct { /* The first member must be m_userData so that the XML_GetUserData macro works. */ @@ -617,6 +623,9 @@ struct XML_ParserStruct { const char *m_bufferLim; XML_Index m_parseEndByteIndex; const char *m_parseEndPtr; + size_t m_partialTokenBytesBefore; /* used in heuristic to avoid O(n^2) */ + XML_Bool m_reparseDeferralEnabled; + int m_lastBufferRequestSize; XML_Char *m_dataBuf; XML_Char *m_dataBufEnd; XML_StartElementHandler m_startElementHandler; @@ -948,6 +957,47 @@ get_hash_secret_salt(XML_Parser parser) { return parser->m_hash_secret_salt; } +static enum XML_Error +callProcessor(XML_Parser parser, const char *start, const char *end, + const char **endPtr) { + const size_t have_now = EXPAT_SAFE_PTR_DIFF(end, start); + + if (parser->m_reparseDeferralEnabled + && ! parser->m_parsingStatus.finalBuffer) { + // Heuristic: don't try to parse a partial token again until the amount of + // available data has increased significantly. + const size_t had_before = parser->m_partialTokenBytesBefore; + // ...but *do* try anyway if we're close to causing a reallocation. + size_t available_buffer + = EXPAT_SAFE_PTR_DIFF(parser->m_bufferPtr, parser->m_buffer); +#if XML_CONTEXT_BYTES > 0 + available_buffer -= EXPAT_MIN(available_buffer, XML_CONTEXT_BYTES); +#endif + available_buffer + += EXPAT_SAFE_PTR_DIFF(parser->m_bufferLim, parser->m_bufferEnd); + // m_lastBufferRequestSize is never assigned a value < 0, so the cast is ok + const bool enough + = (have_now >= 2 * had_before) + || ((size_t)parser->m_lastBufferRequestSize > available_buffer); + + if (! 
enough) { + *endPtr = start; // callers may expect this to be set + return XML_ERROR_NONE; + } + } + g_parseAttempts += 1; + const enum XML_Error ret = parser->m_processor(parser, start, end, endPtr); + if (ret == XML_ERROR_NONE) { + // if we consumed nothing, remember what we had on this parse attempt. + if (*endPtr == start) { + parser->m_partialTokenBytesBefore = have_now; + } else { + parser->m_partialTokenBytesBefore = 0; + } + } + return ret; +} + static XML_Bool /* only valid for root parser */ startParsing(XML_Parser parser) { /* hash functions must be initialized before setContext() is called */ @@ -1129,6 +1179,9 @@ parserInit(XML_Parser parser, const XML_Char *encodingName) { parser->m_bufferEnd = parser->m_buffer; parser->m_parseEndByteIndex = 0; parser->m_parseEndPtr = NULL; + parser->m_partialTokenBytesBefore = 0; + parser->m_reparseDeferralEnabled = g_reparseDeferralEnabledDefault; + parser->m_lastBufferRequestSize = 0; parser->m_declElementType = NULL; parser->m_declAttributeId = NULL; parser->m_declEntity = NULL; @@ -1298,6 +1351,7 @@ XML_ExternalEntityParserCreate(XML_Parser oldParser, const XML_Char *context, to worry which hash secrets each table has. */ unsigned long oldhash_secret_salt; + XML_Bool oldReparseDeferralEnabled; /* Validate the oldParser parameter before we pull everything out of it */ if (oldParser == NULL) @@ -1342,6 +1396,7 @@ XML_ExternalEntityParserCreate(XML_Parser oldParser, const XML_Char *context, to worry which hash secrets each table has. */ oldhash_secret_salt = parser->m_hash_secret_salt; + oldReparseDeferralEnabled = parser->m_reparseDeferralEnabled; #ifdef XML_DTD if (! 
context) @@ -1394,6 +1449,7 @@ XML_ExternalEntityParserCreate(XML_Parser oldParser, const XML_Char *context, parser->m_defaultExpandInternalEntities = oldDefaultExpandInternalEntities; parser->m_ns_triplets = oldns_triplets; parser->m_hash_secret_salt = oldhash_secret_salt; + parser->m_reparseDeferralEnabled = oldReparseDeferralEnabled; parser->m_parentParser = oldParser; #ifdef XML_DTD parser->m_paramEntityParsing = oldParamEntityParsing; @@ -1848,55 +1904,8 @@ XML_Parse(XML_Parser parser, const char *s, int len, int isFinal) { parser->m_parsingStatus.parsing = XML_PARSING; } - if (len == 0) { - parser->m_parsingStatus.finalBuffer = (XML_Bool)isFinal; - if (! isFinal) - return XML_STATUS_OK; - parser->m_positionPtr = parser->m_bufferPtr; - parser->m_parseEndPtr = parser->m_bufferEnd; - - /* If data are left over from last buffer, and we now know that these - data are the final chunk of input, then we have to check them again - to detect errors based on that fact. - */ - parser->m_errorCode - = parser->m_processor(parser, parser->m_bufferPtr, - parser->m_parseEndPtr, &parser->m_bufferPtr); - - if (parser->m_errorCode == XML_ERROR_NONE) { - switch (parser->m_parsingStatus.parsing) { - case XML_SUSPENDED: - /* It is hard to be certain, but it seems that this case - * cannot occur. This code is cleaning up a previous parse - * with no new data (since len == 0). Changing the parsing - * state requires getting to execute a handler function, and - * there doesn't seem to be an opportunity for that while in - * this circumstance. - * - * Given the uncertainty, we retain the code but exclude it - * from coverage tests. 
- * - * LCOV_EXCL_START - */ - XmlUpdatePosition(parser->m_encoding, parser->m_positionPtr, - parser->m_bufferPtr, &parser->m_position); - parser->m_positionPtr = parser->m_bufferPtr; - return XML_STATUS_SUSPENDED; - /* LCOV_EXCL_STOP */ - case XML_INITIALIZED: - case XML_PARSING: - parser->m_parsingStatus.parsing = XML_FINISHED; - /* fall through */ - default: - return XML_STATUS_OK; - } - } - parser->m_eventEndPtr = parser->m_eventPtr; - parser->m_processor = errorProcessor; - return XML_STATUS_ERROR; - } #ifndef XML_CONTEXT_BYTES - else if (parser->m_bufferPtr == parser->m_bufferEnd) { + if (parser->m_bufferPtr == parser->m_bufferEnd) { const char *end; int nLeftOver; enum XML_Status result; @@ -1907,12 +1916,15 @@ XML_Parse(XML_Parser parser, const char *s, int len, int isFinal) { parser->m_processor = errorProcessor; return XML_STATUS_ERROR; } + // though this isn't a buffer request, we assume that `len` is the app's + // preferred buffer fill size, and therefore save it here. + parser->m_lastBufferRequestSize = len; parser->m_parseEndByteIndex += len; parser->m_positionPtr = s; parser->m_parsingStatus.finalBuffer = (XML_Bool)isFinal; parser->m_errorCode - = parser->m_processor(parser, s, parser->m_parseEndPtr = s + len, &end); + = callProcessor(parser, s, parser->m_parseEndPtr = s + len, &end); if (parser->m_errorCode != XML_ERROR_NONE) { parser->m_eventEndPtr = parser->m_eventPtr; @@ -1939,23 +1951,25 @@ XML_Parse(XML_Parser parser, const char *s, int len, int isFinal) { &parser->m_position); nLeftOver = s + len - end; if (nLeftOver) { - if (parser->m_buffer == NULL - || nLeftOver > parser->m_bufferLim - parser->m_buffer) { - /* avoid _signed_ integer overflow */ - char *temp = NULL; - const int bytesToAllocate = (int)((unsigned)len * 2U); - if (bytesToAllocate > 0) { - temp = (char *)REALLOC(parser, parser->m_buffer, bytesToAllocate); - } - if (temp == NULL) { - parser->m_errorCode = XML_ERROR_NO_MEMORY; - parser->m_eventPtr = parser->m_eventEndPtr = NULL; 
- parser->m_processor = errorProcessor; - return XML_STATUS_ERROR; - } - parser->m_buffer = temp; - parser->m_bufferLim = parser->m_buffer + bytesToAllocate; + // Back up and restore the parsing status to avoid XML_ERROR_SUSPENDED + // (and XML_ERROR_FINISHED) from XML_GetBuffer. + const enum XML_Parsing originalStatus = parser->m_parsingStatus.parsing; + parser->m_parsingStatus.parsing = XML_PARSING; + void *const temp = XML_GetBuffer(parser, nLeftOver); + parser->m_parsingStatus.parsing = originalStatus; + // GetBuffer may have overwritten this, but we want to remember what the + // app requested, not how many bytes were left over after parsing. + parser->m_lastBufferRequestSize = len; + if (temp == NULL) { + // NOTE: parser->m_errorCode has already been set by XML_GetBuffer(). + parser->m_eventPtr = parser->m_eventEndPtr = NULL; + parser->m_processor = errorProcessor; + return XML_STATUS_ERROR; } + // Since we know that the buffer was empty and XML_CONTEXT_BYTES is 0, we + // don't have any data to preserve, and can copy straight into the start + // of the buffer rather than the GetBuffer return pointer (which may be + // pointing further into the allocated buffer). 
memcpy(parser->m_buffer, end, nLeftOver); } parser->m_bufferPtr = parser->m_buffer; @@ -1967,15 +1981,14 @@ XML_Parse(XML_Parser parser, const char *s, int len, int isFinal) { return result; } #endif /* not defined XML_CONTEXT_BYTES */ - else { - void *buff = XML_GetBuffer(parser, len); - if (buff == NULL) - return XML_STATUS_ERROR; - else { - memcpy(buff, s, len); - return XML_ParseBuffer(parser, len, isFinal); - } + void *buff = XML_GetBuffer(parser, len); + if (buff == NULL) + return XML_STATUS_ERROR; + if (len > 0) { + assert(s != NULL); // make sure s==NULL && len!=0 was rejected above + memcpy(buff, s, len); } + return XML_ParseBuffer(parser, len, isFinal); } enum XML_Status XMLCALL @@ -2015,8 +2028,8 @@ XML_ParseBuffer(XML_Parser parser, int len, int isFinal) { parser->m_parseEndByteIndex += len; parser->m_parsingStatus.finalBuffer = (XML_Bool)isFinal; - parser->m_errorCode = parser->m_processor( - parser, start, parser->m_parseEndPtr, &parser->m_bufferPtr); + parser->m_errorCode = callProcessor(parser, start, parser->m_parseEndPtr, + &parser->m_bufferPtr); if (parser->m_errorCode != XML_ERROR_NONE) { parser->m_eventEndPtr = parser->m_eventPtr; @@ -2061,10 +2074,14 @@ XML_GetBuffer(XML_Parser parser, int len) { default:; } - if (len > EXPAT_SAFE_PTR_DIFF(parser->m_bufferLim, parser->m_bufferEnd)) { -#ifdef XML_CONTEXT_BYTES + // whether or not the request succeeds, `len` seems to be the app's preferred + // buffer fill size; remember it. 
+ parser->m_lastBufferRequestSize = len; + if (len > EXPAT_SAFE_PTR_DIFF(parser->m_bufferLim, parser->m_bufferEnd) + || parser->m_buffer == NULL) { +#if XML_CONTEXT_BYTES > 0 int keep; -#endif /* defined XML_CONTEXT_BYTES */ +#endif /* XML_CONTEXT_BYTES > 0 */ /* Do not invoke signed arithmetic overflow: */ int neededSize = (int)((unsigned)len + (unsigned)EXPAT_SAFE_PTR_DIFF( @@ -2073,7 +2090,7 @@ XML_GetBuffer(XML_Parser parser, int len) { parser->m_errorCode = XML_ERROR_NO_MEMORY; return NULL; } -#ifdef XML_CONTEXT_BYTES +#if XML_CONTEXT_BYTES > 0 keep = (int)EXPAT_SAFE_PTR_DIFF(parser->m_bufferPtr, parser->m_buffer); if (keep > XML_CONTEXT_BYTES) keep = XML_CONTEXT_BYTES; @@ -2083,10 +2100,11 @@ XML_GetBuffer(XML_Parser parser, int len) { return NULL; } neededSize += keep; -#endif /* defined XML_CONTEXT_BYTES */ - if (neededSize - <= EXPAT_SAFE_PTR_DIFF(parser->m_bufferLim, parser->m_buffer)) { -#ifdef XML_CONTEXT_BYTES +#endif /* XML_CONTEXT_BYTES > 0 */ + if (parser->m_buffer && parser->m_bufferPtr + && neededSize + <= EXPAT_SAFE_PTR_DIFF(parser->m_bufferLim, parser->m_buffer)) { +#if XML_CONTEXT_BYTES > 0 if (keep < EXPAT_SAFE_PTR_DIFF(parser->m_bufferPtr, parser->m_buffer)) { int offset = (int)EXPAT_SAFE_PTR_DIFF(parser->m_bufferPtr, parser->m_buffer) @@ -2099,19 +2117,17 @@ XML_GetBuffer(XML_Parser parser, int len) { parser->m_bufferPtr -= offset; } #else - if (parser->m_buffer && parser->m_bufferPtr) { - memmove(parser->m_buffer, parser->m_bufferPtr, - EXPAT_SAFE_PTR_DIFF(parser->m_bufferEnd, parser->m_bufferPtr)); - parser->m_bufferEnd - = parser->m_buffer - + EXPAT_SAFE_PTR_DIFF(parser->m_bufferEnd, parser->m_bufferPtr); - parser->m_bufferPtr = parser->m_buffer; - } -#endif /* not defined XML_CONTEXT_BYTES */ + memmove(parser->m_buffer, parser->m_bufferPtr, + EXPAT_SAFE_PTR_DIFF(parser->m_bufferEnd, parser->m_bufferPtr)); + parser->m_bufferEnd + = parser->m_buffer + + EXPAT_SAFE_PTR_DIFF(parser->m_bufferEnd, parser->m_bufferPtr); + parser->m_bufferPtr = 
parser->m_buffer; +#endif /* XML_CONTEXT_BYTES > 0 */ } else { char *newBuf; int bufferSize - = (int)EXPAT_SAFE_PTR_DIFF(parser->m_bufferLim, parser->m_bufferPtr); + = (int)EXPAT_SAFE_PTR_DIFF(parser->m_bufferLim, parser->m_buffer); if (bufferSize == 0) bufferSize = INIT_BUFFER_SIZE; do { @@ -2128,7 +2144,7 @@ XML_GetBuffer(XML_Parser parser, int len) { return NULL; } parser->m_bufferLim = newBuf + bufferSize; -#ifdef XML_CONTEXT_BYTES +#if XML_CONTEXT_BYTES > 0 if (parser->m_bufferPtr) { memcpy(newBuf, &parser->m_bufferPtr[-keep], EXPAT_SAFE_PTR_DIFF(parser->m_bufferEnd, parser->m_bufferPtr) @@ -2158,7 +2174,7 @@ XML_GetBuffer(XML_Parser parser, int len) { parser->m_bufferEnd = newBuf; } parser->m_bufferPtr = parser->m_buffer = newBuf; -#endif /* not defined XML_CONTEXT_BYTES */ +#endif /* XML_CONTEXT_BYTES > 0 */ } parser->m_eventPtr = parser->m_eventEndPtr = NULL; parser->m_positionPtr = NULL; @@ -2208,7 +2224,7 @@ XML_ResumeParser(XML_Parser parser) { } parser->m_parsingStatus.parsing = XML_PARSING; - parser->m_errorCode = parser->m_processor( + parser->m_errorCode = callProcessor( parser, parser->m_bufferPtr, parser->m_parseEndPtr, &parser->m_bufferPtr); if (parser->m_errorCode != XML_ERROR_NONE) { @@ -2561,6 +2577,15 @@ XML_SetBillionLaughsAttackProtectionActivationThreshold( } #endif /* XML_DTD */ +XML_Bool XMLCALL +XML_SetReparseDeferralEnabled(XML_Parser parser, XML_Bool enabled) { + if (parser != NULL && (enabled == XML_TRUE || enabled == XML_FALSE)) { + parser->m_reparseDeferralEnabled = enabled; + return XML_TRUE; + } + return XML_FALSE; +} + /* Initially tag->rawName always points into the parse buffer; for those TAG instances opened while the current parse buffer was processed, and not yet closed, we need to store tag->rawName in a more @@ -4482,15 +4507,15 @@ entityValueInitProcessor(XML_Parser parser, const char *s, const char *end, parser->m_processor = entityValueProcessor; return entityValueProcessor(parser, next, end, nextPtr); } - /* If we are 
at the end of the buffer, this would cause XmlPrologTok to - return XML_TOK_NONE on the next call, which would then cause the - function to exit with *nextPtr set to s - that is what we want for other - tokens, but not for the BOM - we would rather like to skip it; - then, when this routine is entered the next time, XmlPrologTok will - return XML_TOK_INVALID, since the BOM is still in the buffer + /* XmlPrologTok has now set the encoding based on the BOM it found, and we + must move s and nextPtr forward to consume the BOM. + + If we didn't, and got XML_TOK_NONE from the next XmlPrologTok call, we + would leave the BOM in the buffer and return. On the next call to this + function, our XmlPrologTok call would return XML_TOK_INVALID, since it + is not valid to have multiple BOMs. */ - else if (tok == XML_TOK_BOM && next == end - && ! parser->m_parsingStatus.finalBuffer) { + else if (tok == XML_TOK_BOM) { # ifdef XML_DTD if (! accountingDiffTolerated(parser, tok, s, next, __LINE__, XML_ACCOUNT_DIRECT)) { @@ -4500,7 +4525,7 @@ entityValueInitProcessor(XML_Parser parser, const char *s, const char *end, # endif *nextPtr = next; - return XML_ERROR_NONE; + s = next; } /* If we get this token, we have the start of what might be a normal tag, but not a declaration (i.e. it doesn't begin with diff --git a/expat/tests/minicheck.c b/expat/tests/minicheck.c index 1c65748..f383380 100644 --- a/expat/tests/minicheck.c +++ b/expat/tests/minicheck.c @@ -208,6 +208,21 @@ srunner_run_all(SRunner *runner, int verbosity) { } } +void +_fail(const char *file, int line, const char *msg) { + /* Always print the error message so it isn't lost. In this case, + we have a failure, so there's no reason to be quiet about what + it is. + */ + _check_current_filename = file; + _check_current_lineno = line; + if (msg != NULL) { + const int has_newline = (msg[strlen(msg) - 1] == '\n'); + fprintf(stderr, "ERROR: %s%s", msg, has_newline ? 
"" : "\n"); + } + longjmp(env, 1); +} + void _fail_unless(int condition, const char *file, int line, const char *msg) { /* Always print the error message so it isn't lost. In this case, diff --git a/expat/tests/minicheck.h b/expat/tests/minicheck.h index cc1f835..032b54e 100644 --- a/expat/tests/minicheck.h +++ b/expat/tests/minicheck.h @@ -64,7 +64,14 @@ extern "C" { } \ } -#define fail(msg) _fail_unless(0, __FILE__, __LINE__, msg) + +# define fail(msg) _fail(__FILE__, __LINE__, msg) +# define assert_true(cond) \ + do { \ + if (! (cond)) { \ + _fail(__FILE__, __LINE__, "check failed: " #cond); \ + } \ + } while (0) typedef void (*tcase_setup_function)(void); typedef void (*tcase_teardown_function)(void); @@ -103,6 +110,11 @@ void _check_set_test_info(char const *function, char const *filename, * Prototypes for the actual implementation. */ +# if defined(__GNUC__) +__attribute__((noreturn)) +# endif +void +_fail(const char *file, int line, const char *msg); void _fail_unless(int condition, const char *file, int line, const char *msg); Suite *suite_create(const char *name); TCase *tcase_create(const char *name); diff --git a/expat/tests/runtests.c b/expat/tests/runtests.c index 915fa52..941f61d 100644 --- a/expat/tests/runtests.c +++ b/expat/tests/runtests.c @@ -54,6 +54,7 @@ #include #include #include /* intptr_t uint64_t */ +#include #if ! 
defined(__cplusplus) # include @@ -1071,7 +1072,7 @@ START_TEST(test_column_number_after_parse) { const char *text = ""; XML_Size colno; - if (_XML_Parse_SINGLE_BYTES(g_parser, text, (int)strlen(text), XML_FALSE) + if (_XML_Parse_SINGLE_BYTES(g_parser, text, (int)strlen(text), XML_TRUE) == XML_STATUS_ERROR) xml_failure(g_parser); colno = XML_GetCurrentColumnNumber(g_parser); @@ -2582,7 +2583,7 @@ START_TEST(test_default_current) { if (_XML_Parse_SINGLE_BYTES(g_parser, text, (int)strlen(text), XML_TRUE) == XML_STATUS_ERROR) xml_failure(g_parser); - CharData_CheckXMLChars(&storage, XCS("DCDCDCDCDCDD")); + CharData_CheckXMLChars(&storage, XCS("DCDCDCDD")); /* Again, without the defaulting */ XML_ParserReset(g_parser, NULL); @@ -2593,7 +2594,7 @@ START_TEST(test_default_current) { if (_XML_Parse_SINGLE_BYTES(g_parser, text, (int)strlen(text), XML_TRUE) == XML_STATUS_ERROR) xml_failure(g_parser); - CharData_CheckXMLChars(&storage, XCS("DcccccD")); + CharData_CheckXMLChars(&storage, XCS("DcccD")); /* Now with an internal entity to complicate matters */ XML_ParserReset(g_parser, NULL); @@ -3946,6 +3947,19 @@ START_TEST(test_get_buffer_3_overflow) { END_TEST #endif // defined(XML_CONTEXT_BYTES) +START_TEST(test_getbuffer_allocates_on_zero_len) { + for (int first_len = 1; first_len >= 0; first_len--) { + XML_Parser parser = XML_ParserCreate(NULL); + assert_true(parser != NULL); + assert_true(XML_GetBuffer(parser, first_len) != NULL); + assert_true(XML_GetBuffer(parser, 0) != NULL); + if (XML_ParseBuffer(parser, 0, XML_FALSE) != XML_STATUS_OK) + xml_failure(parser); + XML_ParserFree(parser); + } +} +END_TEST + /* Test position information macros */ START_TEST(test_byte_info_at_end) { const char *text = ""; @@ -6205,6 +6219,12 @@ START_TEST(test_utf8_in_start_tags) { char doc[1024]; size_t failCount = 0; + // we need all the bytes to be parsed, but we don't want the errors that can + // trigger on isFinal=XML_TRUE, so we skip the test if the heuristic is on. 
+ if (g_reparseDeferralEnabledDefault) { + return; + } + for (; i < sizeof(cases) / sizeof(cases[0]); i++) { size_t j = 0; for (; j < sizeof(atNameStart) / sizeof(atNameStart[0]); j++) { @@ -6830,6 +6850,613 @@ START_TEST(test_nested_entity_suspend) { } END_TEST +/* Regression test for quadratic parsing on large tokens */ +START_TEST(test_big_tokens_take_linear_time) { + const char *const too_slow_failure_message + = "Compared to the baseline runtime of the first test, this test has a " + "slowdown of more than . " + "Please keep increasing the value by 1 until it reliably passes the " + "test on your hardware and open a bug sharing that number with us. " + "Thanks in advance!"; + const struct { + const char *pre; + const char *post; + } text[] = { + {"", ""}, // assumed good, used as baseline + {""}, // CDATA, performed OK before patch + {""}, // big attribute, used to be O(N²) + {""}, // long comment, used to be O(N²) + {"<", "/>"}, // big elem name, used to be O(N²) + }; + const int num_cases = sizeof(text) / sizeof(text[0]); + // For the test we need a value that is: + // (1) big enough that the test passes reliably (avoiding flaky tests), and + // (2) small enough that the test actually catches regressions. + const int max_slowdown = 15; + char aaaaaa[4096]; + const int fillsize = (int)sizeof(aaaaaa); + const int fillcount = 100; + + memset(aaaaaa, 'a', fillsize); + + if (! g_reparseDeferralEnabledDefault) { + return; // heuristic is disabled; we would get O(n^2) and fail. + } +#if defined(_WIN32) + if (CLOCKS_PER_SEC < 100000) { + // Skip this test if clock() doesn't have reasonably good resolution. + // This workaround is only applied to Windows targets, since XSI requires + // the value to be 1 000 000 (10x the condition here), and we want to be + // very sure that at least one platform in CI can catch regressions. 
+ return; + } +#endif + + clock_t baseline = 0; + for (int i = 0; i < num_cases; ++i) { + XML_Parser parser = XML_ParserCreate(NULL); + assert_true(parser != NULL); + enum XML_Status status; + const clock_t start = clock(); + + // parse the start text + status = _XML_Parse_SINGLE_BYTES(parser, text[i].pre, + (int)strlen(text[i].pre), XML_FALSE); + if (status != XML_STATUS_OK) { + xml_failure(parser); + } + // parse lots of 'a', failing the test early if it takes too long + for (int f = 0; f < fillcount; ++f) { + status = _XML_Parse_SINGLE_BYTES(parser, aaaaaa, fillsize, XML_FALSE); + if (status != XML_STATUS_OK) { + xml_failure(parser); + } + // i == 0 means we're still calculating the baseline value + if (i > 0) { + const clock_t now = clock(); + const clock_t clocks_so_far = now - start; + const int slowdown = clocks_so_far / baseline; + if (slowdown >= max_slowdown) { + fprintf( + stderr, + "fill#%d: clocks_so_far=%d baseline=%d slowdown=%d max_slowdown=%d\n", + f, (int)clocks_so_far, (int)baseline, slowdown, max_slowdown); + fail(too_slow_failure_message); + } + } + } + // parse the end text + status = _XML_Parse_SINGLE_BYTES(parser, text[i].post, + (int)strlen(text[i].post), XML_TRUE); + if (status != XML_STATUS_OK) { + xml_failure(parser); + } + + // how long did it take in total? 
+ const clock_t end = clock(); + const clock_t taken = end - start; + if (i == 0) { + assert_true(taken > 0); // just to make sure we don't div-by-0 later + baseline = taken; + } + const int slowdown = taken / baseline; + if (slowdown >= max_slowdown) { + fprintf(stderr, "taken=%d baseline=%d slowdown=%d max_slowdown=%d\n", + (int)taken, (int)baseline, slowdown, max_slowdown); + fail(too_slow_failure_message); + } + + XML_ParserFree(parser); + } +} +END_TEST + +START_TEST(test_set_reparse_deferral) { + const char *const pre = ""; + const char *const start = ""; + char eeeeee[100]; + const int fillsize = (int)sizeof(eeeeee); + memset(eeeeee, 'e', fillsize); + + for (int enabled = 0; enabled <= 1; enabled += 1) { + + XML_Parser parser = XML_ParserCreate(NULL); + assert_true(parser != NULL); + assert_true(XML_SetReparseDeferralEnabled(parser, enabled)); + // pre-grow the buffer to avoid reparsing due to almost-fullness + assert_true(XML_GetBuffer(parser, fillsize * 10103) != NULL); + + CharData storage; + CharData_Init(&storage); + XML_SetUserData(parser, &storage); + XML_SetStartElementHandler(parser, start_element_event_handler); + + enum XML_Status status; + // parse the start text + status = XML_Parse(parser, pre, (int)strlen(pre), XML_FALSE); + if (status != XML_STATUS_OK) { + xml_failure(parser); + } + CharData_CheckXMLChars(&storage, XCS("d")); // first element should be done + + // ..and the start of the token + status = XML_Parse(parser, start, (int)strlen(start), XML_FALSE); + if (status != XML_STATUS_OK) { + xml_failure(parser); + } + CharData_CheckXMLChars(&storage, XCS("d")); // still just the first one + + // try to parse lots of 'e', but the token isn't finished + for (int c = 0; c < 100; ++c) { + status = XML_Parse(parser, eeeeee, fillsize, XML_FALSE); + if (status != XML_STATUS_OK) { + xml_failure(parser); + } + } + CharData_CheckXMLChars(&storage, XCS("d")); // *still* just the first one + + // end the token. 
+ status = XML_Parse(parser, end, (int)strlen(end), XML_FALSE); + if (status != XML_STATUS_OK) { + xml_failure(parser); + } + + if (enabled) { + // In general, we may need to push more data to trigger a reparse attempt, + // but in this test, the data is constructed to always require it. + CharData_CheckXMLChars(&storage, XCS("d")); // or the test is incorrect + // 2x the token length should suffice; the +1 covers the start and end. + for (int c = 0; c < 101; ++c) { + status = XML_Parse(parser, eeeeee, fillsize, XML_FALSE); + if (status != XML_STATUS_OK) { + xml_failure(parser); + } + } + } + CharData_CheckXMLChars(&storage, XCS("dx")); // the should be done + + XML_ParserFree(parser); + } +} +END_TEST + +struct element_decl_data { + XML_Parser parser; + int count; +}; + +static void +element_decl_counter(void *userData, const XML_Char *name, XML_Content *model) { + UNUSED_P(name); + struct element_decl_data *testdata = (struct element_decl_data *)userData; + testdata->count += 1; + XML_FreeContentModel(testdata->parser, model); +} + +static int +external_inherited_parser(XML_Parser p, const XML_Char *context, + const XML_Char *base, const XML_Char *systemId, + const XML_Char *publicId) { + UNUSED_P(base); + UNUSED_P(systemId); + UNUSED_P(publicId); + const char *const pre = "\n"; + const char *const start = "\n"; + const char *const post = "\n"; + const int enabled = *(int *)XML_GetUserData(p); + char eeeeee[100]; + char spaces[100]; + const int fillsize = (int)sizeof(eeeeee); + assert_true(fillsize == (int)sizeof(spaces)); + memset(eeeeee, 'e', fillsize); + memset(spaces, ' ', fillsize); + + XML_Parser parser = XML_ExternalEntityParserCreate(p, context, NULL); + assert_true(parser != NULL); + // pre-grow the buffer to avoid reparsing due to almost-fullness + assert_true(XML_GetBuffer(parser, fillsize * 10103) != NULL); + + struct element_decl_data testdata; + testdata.parser = parser; + testdata.count = 0; + XML_SetUserData(parser, &testdata); + 
XML_SetElementDeclHandler(parser, element_decl_counter); + + enum XML_Status status; + // parse the initial text + status = XML_Parse(parser, pre, (int)strlen(pre), XML_FALSE); + if (status != XML_STATUS_OK) { + xml_failure(parser); + } + assert_true(testdata.count == 1); // first element should be done + + // ..and the start of the big token + status = XML_Parse(parser, start, (int)strlen(start), XML_FALSE); + if (status != XML_STATUS_OK) { + xml_failure(parser); + } + assert_true(testdata.count == 1); // still just the first one + + // try to parse lots of 'e', but the token isn't finished + for (int c = 0; c < 100; ++c) { + status = XML_Parse(parser, eeeeee, fillsize, XML_FALSE); + if (status != XML_STATUS_OK) { + xml_failure(parser); + } + } + assert_true(testdata.count == 1); // *still* just the first one + + // end the big token. + status = XML_Parse(parser, end, (int)strlen(end), XML_FALSE); + if (status != XML_STATUS_OK) { + xml_failure(parser); + } + + if (enabled) { + // In general, we may need to push more data to trigger a reparse attempt, + // but in this test, the data is constructed to always require it. + assert_true(testdata.count == 1); // or the test is incorrect + // 2x the token length should suffice; the +1 covers the start and end. 
+ for (int c = 0; c < 101; ++c) { + status = XML_Parse(parser, spaces, fillsize, XML_FALSE); + if (status != XML_STATUS_OK) { + xml_failure(parser); + } + } + } + assert_true(testdata.count == 2); // the big token should be done + + // parse the final text + status = XML_Parse(parser, post, (int)strlen(post), XML_TRUE); + if (status != XML_STATUS_OK) { + xml_failure(parser); + } + assert_true(testdata.count == 3); // after isFinal=XML_TRUE, all must be done + + XML_ParserFree(parser); + return XML_STATUS_OK; +} + +START_TEST(test_reparse_deferral_is_inherited) { + const char *const text + = ""; + for (int enabled = 0; enabled <= 1; ++enabled) { + + XML_Parser parser = XML_ParserCreate(NULL); + assert_true(parser != NULL); + XML_SetUserData(parser, (void *)&enabled); + XML_SetParamEntityParsing(parser, XML_PARAM_ENTITY_PARSING_ALWAYS); + // this handler creates a sub-parser and checks that its deferral behavior + // is what we expected, based on the value of `enabled` (in userdata). + XML_SetExternalEntityRefHandler(parser, external_inherited_parser); + assert_true(XML_SetReparseDeferralEnabled(parser, enabled)); + if (XML_Parse(parser, text, (int)strlen(text), XML_TRUE) != XML_STATUS_OK) + xml_failure(parser); + + XML_ParserFree(parser); + } +} +END_TEST + +START_TEST(test_set_reparse_deferral_on_null_parser) { + assert_true(XML_SetReparseDeferralEnabled(NULL, 0) == XML_FALSE); + assert_true(XML_SetReparseDeferralEnabled(NULL, 1) == XML_FALSE); + assert_true(XML_SetReparseDeferralEnabled(NULL, 10) == XML_FALSE); + assert_true(XML_SetReparseDeferralEnabled(NULL, 100) == XML_FALSE); + assert_true(XML_SetReparseDeferralEnabled(NULL, (XML_Bool)INT_MIN) + == XML_FALSE); + assert_true(XML_SetReparseDeferralEnabled(NULL, (XML_Bool)INT_MAX) + == XML_FALSE); +} +END_TEST + +START_TEST(test_set_reparse_deferral_on_the_fly) { + const char *const pre = ""; + char iiiiii[100]; + const int fillsize = (int)sizeof(iiiiii); + memset(iiiiii, 'i', fillsize); + + XML_Parser parser = 
XML_ParserCreate(NULL); + assert_true(parser != NULL); + assert_true(XML_SetReparseDeferralEnabled(parser, XML_TRUE)); + + CharData storage; + CharData_Init(&storage); + XML_SetUserData(parser, &storage); + XML_SetStartElementHandler(parser, start_element_event_handler); + + enum XML_Status status; + // parse the start text + status = XML_Parse(parser, pre, (int)strlen(pre), XML_FALSE); + if (status != XML_STATUS_OK) { + xml_failure(parser); + } + CharData_CheckXMLChars(&storage, XCS("d")); // first element should be done + + // try to parse some 'i', but the token isn't finished + status = XML_Parse(parser, iiiiii, fillsize, XML_FALSE); + if (status != XML_STATUS_OK) { + xml_failure(parser); + } + CharData_CheckXMLChars(&storage, XCS("d")); // *still* just the first one + + // end the token. + status = XML_Parse(parser, end, (int)strlen(end), XML_FALSE); + if (status != XML_STATUS_OK) { + xml_failure(parser); + } + CharData_CheckXMLChars(&storage, XCS("d")); // not yet. + + // now change the heuristic setting and add *no* data + assert_true(XML_SetReparseDeferralEnabled(parser, XML_FALSE)); + // we avoid isFinal=XML_TRUE, because that would force-bypass the heuristic. 
+ status = XML_Parse(parser, "", 0, XML_FALSE); + if (status != XML_STATUS_OK) { + xml_failure(parser); + } + CharData_CheckXMLChars(&storage, XCS("dx")); + + XML_ParserFree(parser); +} +END_TEST + +START_TEST(test_set_bad_reparse_option) { + XML_Parser parser = XML_ParserCreate(NULL); + assert_true(XML_FALSE == XML_SetReparseDeferralEnabled(parser, 2)); + assert_true(XML_FALSE == XML_SetReparseDeferralEnabled(parser, 3)); + assert_true(XML_FALSE == XML_SetReparseDeferralEnabled(parser, 99)); + assert_true(XML_FALSE == XML_SetReparseDeferralEnabled(parser, 127)); + assert_true(XML_FALSE == XML_SetReparseDeferralEnabled(parser, 128)); + assert_true(XML_FALSE == XML_SetReparseDeferralEnabled(parser, 129)); + assert_true(XML_FALSE == XML_SetReparseDeferralEnabled(parser, 255)); + assert_true(XML_TRUE == XML_SetReparseDeferralEnabled(parser, 0)); + assert_true(XML_TRUE == XML_SetReparseDeferralEnabled(parser, 1)); + XML_ParserFree(parser); +} +END_TEST + +static size_t g_totalAlloc = 0; +static size_t g_biggestAlloc = 0; + +static void * +counting_realloc(void *ptr, size_t size) { + g_totalAlloc += size; + if (size > g_biggestAlloc) { + g_biggestAlloc = size; + } + return realloc(ptr, size); +} + +static void * +counting_malloc(size_t size) { + return counting_realloc(NULL, size); +} + +START_TEST(test_bypass_heuristic_when_close_to_bufsize) { + if (! g_reparseDeferralEnabledDefault) { + return; // this test is irrelevant when the deferral heuristic is disabled. 
+ } + + const int document_length = 65536; + char *const document = (char *)malloc(document_length); + + const XML_Memory_Handling_Suite memfuncs = { + counting_malloc, + counting_realloc, + free, + }; + + const int leading_list[] = {0, 3, 61, 96, 400, 401, 4000, 4010, 4099, -1}; + const int bigtoken_list[] = {3000, 4000, 4001, 4096, 4099, 5000, 20000, -1}; + const int fillsize_list[] = {131, 256, 399, 400, 401, 1025, 4099, 4321, -1}; + + for (const int *leading = leading_list; *leading >= 0; leading++) { + for (const int *bigtoken = bigtoken_list; *bigtoken >= 0; bigtoken++) { + for (const int *fillsize = fillsize_list; *fillsize >= 0; fillsize++) { + // start by checking that the test looks reasonably valid + assert_true(*leading + *bigtoken <= document_length); + + // put 'x' everywhere; some will be overwritten by elements. + memset(document, 'x', document_length); + // maybe add an initial tag + if (*leading) { + assert_true(*leading >= 3); // or the test case is invalid + memcpy(document, "", 3); + } + // add the large token + document[*leading + 0] = '<'; + document[*leading + 1] = 'b'; + memset(&document[*leading + 2], ' ', *bigtoken - 2); // a spacy token + document[*leading + *bigtoken - 1] = '>'; + + // 1 for 'b', plus 1 or 0 depending on the presence of 'a' + const int expected_elem_total = 1 + (*leading ? 
1 : 0); + + XML_Parser parser = XML_ParserCreate_MM(NULL, &memfuncs, NULL); + assert_true(parser != NULL); + + CharData storage; + CharData_Init(&storage); + XML_SetUserData(parser, &storage); + XML_SetStartElementHandler(parser, start_element_event_handler); + + g_biggestAlloc = 0; + g_totalAlloc = 0; + int offset = 0; + // fill data until the big token is covered (but not necessarily parsed) + while (offset < *leading + *bigtoken) { + assert_true(offset + *fillsize <= document_length); + const enum XML_Status status + = XML_Parse(parser, &document[offset], *fillsize, XML_FALSE); + if (status != XML_STATUS_OK) { + xml_failure(parser); + } + offset += *fillsize; + } + // Now, check that we've had a buffer allocation that could fit the + // context bytes and our big token. In order to detect a special case, + // we need to know how many bytes of our big token were included in the + // first push that contained _any_ bytes of the big token: + const int bigtok_first_chunk_bytes = *fillsize - (*leading % *fillsize); + if (bigtok_first_chunk_bytes >= *bigtoken && XML_CONTEXT_BYTES == 0) { + // Special case: we aren't saving any context, and the whole big token + // was covered by a single fill, so Expat may have parsed directly + // from our input pointer, without allocating an internal buffer. 
+ } else if (*leading < XML_CONTEXT_BYTES) { + assert_true(g_biggestAlloc >= *leading + (size_t)*bigtoken); + } else { + assert_true(g_biggestAlloc >= XML_CONTEXT_BYTES + (size_t)*bigtoken); + } + // fill data until the big token is actually parsed + while (storage.count < expected_elem_total) { + const size_t alloc_before = g_totalAlloc; + assert_true(offset + *fillsize <= document_length); + const enum XML_Status status + = XML_Parse(parser, &document[offset], *fillsize, XML_FALSE); + if (status != XML_STATUS_OK) { + xml_failure(parser); + } + offset += *fillsize; + // since all the bytes of the big token are already in the buffer, + // the bufsize ceiling should make us finish its parsing without any + // further buffer allocations. We assume that there will be no other + // large allocations in this test. + assert_true(g_totalAlloc - alloc_before < 4096); + } + // test-the-test: was our alloc even called? + assert_true(g_totalAlloc > 0); + // test-the-test: there shouldn't be any extra start elements + assert_true(storage.count == expected_elem_total); + + XML_ParserFree(parser); + } + } + } + free(document); +} +END_TEST + +START_TEST(test_varying_buffer_fills) { + const int KiB = 1024; + const int MiB = 1024 * KiB; + const int document_length = 16 * MiB; + const int big = 7654321; // arbitrarily chosen between 4 and 8 MiB + + char *const document = (char *)malloc(document_length); + assert_true(document != NULL); + memset(document, 'x', document_length); + document[0] = '<'; + document[1] = 't'; + memset(&document[2], ' ', big - 2); // a very spacy token + document[big - 1] = '>'; + + // Each testcase is a list of buffer fill sizes, terminated by a value < 0. + // When reparse deferral is enabled, the final (negated) value is the expected + // maximum number of bytes scanned in parse attempts. 
+ const int testcases[][30] = { + {8 * MiB, -8 * MiB}, + {4 * MiB, 4 * MiB, -12 * MiB}, // try at 4MB, then 8MB = 12 MB total + // zero-size fills shouldn't trigger the bypass + {4 * MiB, 0, 4 * MiB, -12 * MiB}, + {4 * MiB, 0, 0, 4 * MiB, -12 * MiB}, + {4 * MiB, 0, 1 * MiB, 0, 3 * MiB, -12 * MiB}, + // try to hit the buffer ceiling only once (at the end) + {4 * MiB, 2 * MiB, 1 * MiB, 512 * KiB, 256 * KiB, 256 * KiB, -12 * MiB}, + // try to hit the same buffer ceiling multiple times + {4 * MiB + 1, 2 * MiB, 1 * MiB, 512 * KiB, -25 * MiB}, + + // try to hit every ceiling, by always landing 1K shy of the buffer size + {1 * KiB, 2 * KiB, 4 * KiB, 8 * KiB, 16 * KiB, 32 * KiB, 64 * KiB, + 128 * KiB, 256 * KiB, 512 * KiB, 1 * MiB, 2 * MiB, 4 * MiB, -16 * MiB}, + + // try to avoid every ceiling, by always landing 1B past the buffer size + // the normal 2x heuristic threshold still forces parse attempts. + {2 * KiB + 1, // will attempt 2KiB + 1 ==> total 2KiB + 1 + 2 * KiB, 4 * KiB, // will attempt 8KiB + 1 ==> total 10KiB + 2 + 8 * KiB, 16 * KiB, // will attempt 32KiB + 1 ==> total 42KiB + 3 + 32 * KiB, 64 * KiB, // will attempt 128KiB + 1 ==> total 170KiB + 4 + 128 * KiB, 256 * KiB, // will attempt 512KiB + 1 ==> total 682KiB + 5 + 512 * KiB, 1 * MiB, // will attempt 2MiB + 1 ==> total 2M + 682K + 6 + 2 * MiB, 4 * MiB, // will attempt 8MiB + 1 ==> total 10M + 682K + 7 + -(10 * MiB + 682 * KiB + 7)}, + // try to avoid every ceiling again, except on our last fill. 
+ {2 * KiB + 1, // will attempt 2KiB + 1 ==> total 2KiB + 1 + 2 * KiB, 4 * KiB, // will attempt 8KiB + 1 ==> total 10KiB + 2 + 8 * KiB, 16 * KiB, // will attempt 32KiB + 1 ==> total 42KiB + 3 + 32 * KiB, 64 * KiB, // will attempt 128KiB + 1 ==> total 170KiB + 4 + 128 * KiB, 256 * KiB, // will attempt 512KiB + 1 ==> total 682KiB + 5 + 512 * KiB, 1 * MiB, // will attempt 2MiB + 1 ==> total 2M + 682K + 6 + 2 * MiB, 4 * MiB - 1, // will attempt 8MiB ==> total 10M + 682K + 6 + -(10 * MiB + 682 * KiB + 6)}, + + // try to hit ceilings on the way multiple times + {512 * KiB + 1, 256 * KiB, 128 * KiB, 128 * KiB - 1, // 1 MiB buffer + 512 * KiB + 1, 256 * KiB, 128 * KiB, 128 * KiB - 1, // 2 MiB buffer + 1 * MiB + 1, 512 * KiB, 256 * KiB, 256 * KiB - 1, // 4 MiB buffer + 2 * MiB + 1, 1 * MiB, 512 * KiB, // 8 MiB buffer + // we'll make a parse attempt at every parse call + -(45 * MiB + 12)}, + }; + const int testcount = sizeof(testcases) / sizeof(testcases[0]); + for (int test_i = 0; test_i < testcount; test_i++) { + const int *fillsize = testcases[test_i]; + XML_Parser parser = XML_ParserCreate(NULL); + assert_true(parser != NULL); + g_parseAttempts = 0; + + CharData storage; + CharData_Init(&storage); + XML_SetUserData(parser, &storage); + XML_SetStartElementHandler(parser, start_element_event_handler); + + int worstcase_bytes = 0; // sum of (buffered bytes at each XML_Parse call) + int scanned_bytes = 0; // sum of (buffered bytes at each actual parse) + int offset = 0; + while (*fillsize >= 0) { + assert_true(offset + *fillsize <= document_length); // or test is invalid + const unsigned attempts_before = g_parseAttempts; + const enum XML_Status status + = XML_Parse(parser, &document[offset], *fillsize, XML_FALSE); + if (status != XML_STATUS_OK) { + xml_failure(parser); + } + offset += *fillsize; + fillsize++; + assert_true(offset <= INT_MAX - worstcase_bytes); // avoid overflow + worstcase_bytes += offset; // we might've tried to parse all pending bytes + if 
(g_parseAttempts != attempts_before) { + assert_true(g_parseAttempts == attempts_before + 1); // max 1/XML_Parse + assert_true(offset <= INT_MAX - scanned_bytes); // avoid overflow + scanned_bytes += offset; // we *did* try to parse all pending bytes + } + } + assert_true(storage.count == 1); // the big token should've been parsed + assert_true(scanned_bytes > 0); // test-the-test: does our counter work? + if (g_reparseDeferralEnabledDefault) { + // heuristic is enabled; some XML_Parse calls may have deferred reparsing + const int max_bytes_scanned = -*fillsize; + if (scanned_bytes > max_bytes_scanned) { + fprintf(stderr, + "bytes scanned in parse attempts: actual=%d limit=%d \n", + scanned_bytes, max_bytes_scanned); + fail("too many bytes scanned in parse attempts"); + } + assert_true(scanned_bytes <= worstcase_bytes); + } else { + // heuristic is disabled; every XML_Parse() will have reparsed + assert_true(scanned_bytes == worstcase_bytes); + } + + XML_ParserFree(parser); + } + free(document); +} +END_TEST + + /* * Namespaces tests. */ @@ -6902,13 +7529,13 @@ START_TEST(test_return_ns_triplet) { if (_XML_Parse_SINGLE_BYTES(g_parser, text, (int)strlen(text), XML_FALSE) == XML_STATUS_ERROR) xml_failure(g_parser); - if (! triplet_start_flag) - fail("triplet_start_checker not invoked"); /* Check that unsetting "return triplets" fails while still parsing */ XML_SetReturnNSTriplet(g_parser, XML_FALSE); if (_XML_Parse_SINGLE_BYTES(g_parser, epilog, (int)strlen(epilog), XML_TRUE) == XML_STATUS_ERROR) xml_failure(g_parser); + if (! triplet_start_flag) + fail("triplet_start_checker not invoked"); if (! 
triplet_end_flag) fail("triplet_end_checker not invoked"); if (dummy_handler_flags @@ -12219,6 +12846,7 @@ make_suite(void) { #if defined(XML_CONTEXT_BYTES) tcase_add_test(tc_basic, test_get_buffer_3_overflow); #endif + tcase_add_test(tc_basic, test_getbuffer_allocates_on_zero_len); tcase_add_test(tc_basic, test_byte_info_at_end); tcase_add_test(tc_basic, test_byte_info_at_error); tcase_add_test(tc_basic, test_byte_info_at_cdata); @@ -12337,7 +12965,14 @@ make_suite(void) { tcase_add_test__ifdef_xml_dtd(tc_basic, test_pool_integrity_with_unfinished_attr); tcase_add_test(tc_basic, test_nested_entity_suspend); - + tcase_add_test(tc_basic, test_big_tokens_take_linear_time); + tcase_add_test(tc_basic, test_set_reparse_deferral); + tcase_add_test(tc_basic, test_reparse_deferral_is_inherited); + tcase_add_test(tc_basic, test_set_reparse_deferral_on_null_parser); + tcase_add_test(tc_basic, test_set_reparse_deferral_on_the_fly); + tcase_add_test(tc_basic, test_set_bad_reparse_option); + tcase_add_test(tc_basic, test_bypass_heuristic_when_close_to_bufsize); + tcase_add_test(tc_basic, test_varying_buffer_fills); suite_add_tcase(s, tc_namespace); tcase_add_checked_fixture(tc_namespace, namespace_setup, namespace_teardown); tcase_add_test(tc_namespace, test_return_ns_triplet); diff --git a/expat/xmlwf/xmlwf.c b/expat/xmlwf/xmlwf.c index 471f2a2..7c62919 100644 --- a/expat/xmlwf/xmlwf.c +++ b/expat/xmlwf/xmlwf.c @@ -914,6 +914,9 @@ usage(const XML_Char *prog, int rc) { T(" -a FACTOR set maximum tolerated [a]mplification factor (default: 100.0)\n") T(" -b BYTES set number of output [b]ytes needed to activate (default: 8 MiB)\n") T("\n") + T("reparse deferral:\n") + T(" -q disable reparse deferral, and allow [q]uadratic parse runtime with large tokens\n") + T("\n") T("info arguments:\n") T(" -h show this [h]elp message and exit\n") T(" -v show program's [v]ersion number and exit\n") @@ -967,6 +970,8 @@ tmain(int argc, XML_Char **argv) { unsigned long long attackThresholdBytes; 
XML_Bool attackThresholdGiven = XML_FALSE; + XML_Bool disableDeferral = XML_FALSE; + int exitCode = XMLWF_EXIT_SUCCESS; enum XML_ParamEntityParsing paramEntityParsing = XML_PARAM_ENTITY_PARSING_NEVER; @@ -1089,6 +1094,11 @@ tmain(int argc, XML_Char **argv) { #endif break; } + case T('q'): { + disableDeferral = XML_TRUE; + j++; + break; + } case T('\0'): if (j > 1) { i++; @@ -1134,6 +1144,16 @@ tmain(int argc, XML_Char **argv) { #endif } + if (disableDeferral) { + const XML_Bool success = XML_SetReparseDeferralEnabled(parser, XML_FALSE); + if (! success) { + // This prevents tperror(..) from reporting misleading "[..]: Success" + errno = EINVAL; + tperror(T("Failed to disable reparse deferral")); + exit(XMLWF_EXIT_INTERNAL_ERROR); + } + } + if (requireStandalone) XML_SetNotStandaloneHandler(parser, notStandalone); XML_SetParamEntityParsing(parser, paramEntityParsing); diff --git a/expat/xmlwf/xmlwf_helpgen.py b/expat/xmlwf/xmlwf_helpgen.py index c2a527f..1bd0a0a 100755 --- a/expat/xmlwf/xmlwf_helpgen.py +++ b/expat/xmlwf/xmlwf_helpgen.py @@ -81,6 +81,10 @@ billion_laughs.add_argument('-a', metavar='FACTOR', help='set maximum tolerated [a]mplification factor (default: 100.0)') billion_laughs.add_argument('-b', metavar='BYTES', help='set number of output [b]ytes needed to activate (default: 8 MiB)') +reparse_deferral = parser.add_argument_group('reparse deferral') +reparse_deferral.add_argument('-q', metavar='FACTOR', + help='disable reparse deferral, and allow [q]uadratic parse runtime with large tokens') + parser.add_argument('files', metavar='FILE', nargs='*', help='file to process (default: STDIN)') info = parser.add_argument_group('info arguments') diff --git a/testdata/largefiles/aaaaaa_attr.xml b/testdata/largefiles/aaaaaa_attr.xml new file mode 100644 index 0000000..66e3d25 --- /dev/null +++ b/testdata/largefiles/aaaaaa_attr.xml @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/testdata/largefiles/aaaaaa_cdata.xml b/testdata/largefiles/aaaaaa_cdata.xml 
new file mode 100644 index 0000000..66f64bd --- /dev/null +++ b/testdata/largefiles/aaaaaa_cdata.xml @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/testdata/largefiles/aaaaaa_comment.xml b/testdata/largefiles/aaaaaa_comment.xml new file mode 100644 index 0000000..bb9af13 --- /dev/null +++ b/testdata/largefiles/aaaaaa_comment.xml @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/testdata/largefiles/aaaaaa_tag.xml b/testdata/largefiles/aaaaaa_tag.xml new file mode 100644 index 0000000..946f701 --- /dev/null +++ b/testdata/largefiles/aaaaaa_tag.xml @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/testdata/largefiles/aaaaaa_text.xml b/testdata/largefiles/aaaaaa_text.xml new file mode 100644 index 0000000..e266acb --- /dev/null +++ b/testdata/largefiles/aaaaaa_text.xml @@ -0,0 +1 @@ +ACHARS \ No newline at end of file