From b3b4d0847c0b22a6f2b12090d8b6b79c4cdea95c Mon Sep 17 00:00:00 2001 From: Bert JW Regeer Date: Sat, 12 Mar 2022 18:30:30 -0700 Subject: [PATCH 1/8] Add new regular expressions for Chunked Encoding This also moves some regular expressions for QUOTED_PAIR/QUOTED_STRING into this module from utilities so that they may be reused. (cherry picked from commit e75b0d9afbea8a933f8f5f11d279e661cbfd676b) --- waitress/rfc7230.py | 27 ++++++++++++++++++++++++++- waitress/utilities.py | 28 +++------------------------- 2 files changed, 29 insertions(+), 26 deletions(-) diff --git a/waitress/rfc7230.py b/waitress/rfc7230.py index cd33c90..4c4c0a9 100644 --- a/waitress/rfc7230.py +++ b/waitress/rfc7230.py @@ -7,6 +7,9 @@ import re from .compat import tobytes +HEXDIG = "[0-9a-fA-F]" +DIGIT = "[0-9]" + WS = "[ \t]" OWS = WS + "{0,}?" RWS = WS + "{1,}?" @@ -27,6 +30,12 @@ TOKEN = TCHAR + "{1,}" # ; visible (printing) characters VCHAR = r"\x21-\x7e" +# The '\\' between \x5b and \x5d is needed to escape \x5d (']') +QDTEXT = "[\t \x21\x23-\x5b\\\x5d-\x7e" + OBS_TEXT + "]" + +QUOTED_PAIR = r"\\" + "([\t " + VCHAR + OBS_TEXT + "])" +QUOTED_STRING = '"(?:(?:' + QDTEXT + ")|(?:" + QUOTED_PAIR + '))*"' + # header-field = field-name ":" OWS field-value OWS # field-name = token # field-value = *( field-content / obs-fold ) @@ -45,8 +54,24 @@ FIELD_CONTENT = FIELD_VCHAR + "+(?:[ \t]+" + FIELD_VCHAR + "+)*" # Which allows the field value here to just see if there is even a value in the first place FIELD_VALUE = "(?:" + FIELD_CONTENT + ")?" 
-HEADER_FIELD = re.compile( +# chunk-ext = *( ";" chunk-ext-name [ "=" chunk-ext-val ] ) +# chunk-ext-name = token +# chunk-ext-val = token / quoted-string + +CHUNK_EXT_NAME = TOKEN +CHUNK_EXT_VAL = "(?:" + TOKEN + ")|(?:" + QUOTED_STRING + ")" +CHUNK_EXT = ( + "(?:;(?P<extension>" + CHUNK_EXT_NAME + ")(?:=(?P<value>" + CHUNK_EXT_VAL + "))?)*" +) + +# Pre-compiled regular expressions for use elsewhere +ONLY_HEXDIG_RE = re.compile(tobytes("^" + HEXDIG + "+$")) +ONLY_DIGIT_RE = re.compile(tobytes("^" + DIGIT + "+$")) +HEADER_FIELD_RE = re.compile( tobytes( "^(?P<name>" + TOKEN + "):" + OWS + "(?P<value>" + FIELD_VALUE + ")" + OWS + "$" ) ) +QUOTED_PAIR_RE = re.compile(QUOTED_PAIR) +QUOTED_STRING_RE = re.compile(QUOTED_STRING) +CHUNK_EXT_RE = re.compile(tobytes("^" + CHUNK_EXT + "$")) diff --git a/waitress/utilities.py b/waitress/utilities.py index 556bed2..fa59657 100644 --- a/waitress/utilities.py +++ b/waitress/utilities.py @@ -22,7 +22,7 @@ import re import stat import time -from .rfc7230 import OBS_TEXT, VCHAR +from .rfc7230 import QUOTED_PAIR_RE, QUOTED_STRING_RE logger = logging.getLogger("waitress") queue_logger = logging.getLogger("waitress.queue") @@ -216,32 +216,10 @@ def parse_http_date(d): return retval -# RFC 5234 Appendix B.1 "Core Rules": -# VCHAR = %x21-7E -# ; visible (printing) characters -vchar_re = VCHAR - -# RFC 7230 Section 3.2.6 "Field Value Components": -# quoted-string = DQUOTE *( qdtext / quoted-pair ) DQUOTE -# qdtext = HTAB / SP /%x21 / %x23-5B / %x5D-7E / obs-text -# obs-text = %x80-FF -# quoted-pair = "\" ( HTAB / SP / VCHAR / obs-text ) -obs_text_re = OBS_TEXT - -# The '\\' between \x5b and \x5d is needed to escape \x5d (']') -qdtext_re = "[\t \x21\x23-\x5b\\\x5d-\x7e" + obs_text_re + "]" - -quoted_pair_re = r"\\" + "([\t " + vchar_re + obs_text_re + "])" -quoted_string_re = '"(?:(?:' + qdtext_re + ")|(?:" + quoted_pair_re + '))*"' - -quoted_string = re.compile(quoted_string_re) -quoted_pair = re.compile(quoted_pair_re) - - def undquote(value): if
value.startswith('"') and value.endswith('"'): # So it claims to be DQUOTE'ed, let's validate that - matches = quoted_string.match(value) + matches = QUOTED_STRING_RE.match(value) if matches and matches.end() == len(value): # Remove the DQUOTE's from the value @@ -249,7 +227,7 @@ def undquote(value): # Remove all backslashes that are followed by a valid vchar or # obs-text - value = quoted_pair.sub(r"\1", value) + value = QUOTED_PAIR_RE.sub(r"\1", value) return value elif not value.startswith('"') and not value.endswith('"'): -- 2.45.2