python-lxml/SOURCES/CVE-2021-43818.patch

diff --git a/src/lxml/html/clean.py b/src/lxml/html/clean.py
index 0492fca..5225a5e 100644
--- a/src/lxml/html/clean.py
+++ b/src/lxml/html/clean.py
@@ -75,18 +75,25 @@ _looks_like_tag_content = re.compile(
 
 # All kinds of schemes besides just javascript: that can cause
 # execution:
-_is_image_dataurl = re.compile(
-    r'^data:image/.+;base64', re.I).search
+_find_image_dataurls = re.compile(
+    r'^data:image/(.+);base64,', re.I).findall
 _is_possibly_malicious_scheme = re.compile(
-    r'(?:javascript|jscript|livescript|vbscript|data|about|mocha):',
-    re.I).search
+    r'(javascript|jscript|livescript|vbscript|data|about|mocha):',
+    re.I).findall
+# SVG images can contain script content
+_is_unsafe_image_type = re.compile(r"(xml|svg)", re.I).findall
+
 def _is_javascript_scheme(s):
-    if _is_image_dataurl(s):
-        return None
-    return _is_possibly_malicious_scheme(s)
+    is_image_url = False
+    for image_type in _find_image_dataurls(s):
+        is_image_url = True
+        if _is_unsafe_image_type(image_type):
+            return True
+    if is_image_url:
+        return False
+    return bool(_is_possibly_malicious_scheme(s))
 
 _substitute_whitespace = re.compile(r'[\s\x00-\x08\x0B\x0C\x0E-\x19]+').sub
-# FIXME: should data: be blocked?
 
 # FIXME: check against: http://msdn2.microsoft.com/en-us/library/ms537512.aspx
 _conditional_comment_re = re.compile(
@@ -514,6 +521,8 @@ class Cleaner(object):
             return True
         if 'expression(' in style:
             return True
+        if '@import' in style:
+            return True
         if '</noscript' in style:
             # e.g. '<noscript><style><a title="</noscript><img src=x onerror=alert(1)>">'
             return True
diff --git a/src/lxml/html/tests/test_clean.py b/src/lxml/html/tests/test_clean.py
index e40cdad..4fab442 100644
--- a/src/lxml/html/tests/test_clean.py
+++ b/src/lxml/html/tests/test_clean.py
@@ -1,3 +1,6 @@
+import base64
+import gzip
+import io
 import unittest, sys
 from lxml.tests.common_imports import make_doctest
 from lxml.etree import LIBXML_VERSION
@@ -89,6 +92,73 @@ class CleanerTest(unittest.TestCase):
             b'<math><style>/* deleted */</style></math>',
             lxml.html.tostring(clean_html(s)))
 
+    def test_sneaky_import_in_style(self):
+        # Prevent "@@importimport" -> "@import" replacement.
+        style_codes = [
+            "@@importimport(extstyle.css)",
+            "@ @  import import(extstyle.css)",
+            "@ @ importimport(extstyle.css)",
+            "@@  import import(extstyle.css)",
+            "@ @import import(extstyle.css)",
+            "@@importimport()",
+        ]
+        for style_code in style_codes:
+            html = '<style>%s</style>' % style_code
+            s = lxml.html.fragment_fromstring(html)
+
+            cleaned = lxml.html.tostring(clean_html(s))
+            self.assertEqual(
+                b'<style>/* deleted */</style>',
+                cleaned,
+                "%s  ->  %s" % (style_code, cleaned))
+
+    def test_svg_data_links(self):
+        # Remove SVG images with potentially insecure content.
+        svg = b'<svg onload="alert(123)" />'
+        gzout = io.BytesIO()
+        f = gzip.GzipFile(fileobj=gzout, mode='wb')
+        f.write(svg)
+        f.close()
+        svgz = gzout.getvalue()
+        svg_b64 = base64.b64encode(svg).decode('ASCII')
+        svgz_b64 = base64.b64encode(svgz).decode('ASCII')
+        urls = [
+            "data:image/svg+xml;base64," + svg_b64,
+            "data:image/svg+xml-compressed;base64," + svgz_b64,
+        ]
+        for url in urls:
+            html = '<img src="%s">' % url
+            s = lxml.html.fragment_fromstring(html)
+
+            cleaned = lxml.html.tostring(clean_html(s))
+            self.assertEqual(
+                b'<img src="">',
+                cleaned,
+                "%s  ->  %s" % (url, cleaned))
+
+    def test_image_data_links(self):
+        data = b'123'
+        data_b64 = base64.b64encode(data).decode('ASCII')
+        urls = [
+            "data:image/jpeg;base64," + data_b64,
+            "data:image/apng;base64," + data_b64,
+            "data:image/png;base64," + data_b64,
+            "data:image/gif;base64," + data_b64,
+            "data:image/webp;base64," + data_b64,
+            "data:image/bmp;base64," + data_b64,
+            "data:image/tiff;base64," + data_b64,
+            "data:image/x-icon;base64," + data_b64,
+        ]
+        for url in urls:
+            html = '<img src="%s">' % url
+            s = lxml.html.fragment_fromstring(html)
+
+            cleaned = lxml.html.tostring(clean_html(s))
+            self.assertEqual(
+                html.encode("UTF-8"),
+                cleaned,
+                "%s  ->  %s" % (url, cleaned))
+
     def test_formaction_attribute_in_button_input(self):
         # The formaction attribute overrides the form's action and should be
         # treated as a malicious link attribute
import python-lxml-4.2.3-6.module+el8.9.0+19487+7dc18407 11 months ago			`diff --git a/src/lxml/html/clean.py b/src/lxml/html/clean.py`
			`index 0492fca..5225a5e 100644`
			`--- a/src/lxml/html/clean.py`
			`+++ b/src/lxml/html/clean.py`
			`@@ -75,18 +75,25 @@ _looks_like_tag_content = re.compile(`

			`# All kinds of schemes besides just javascript: that can cause`
			`# execution:`
			`-_is_image_dataurl = re.compile(`
			`- r'^data:image/.+;base64', re.I).search`
			`+_find_image_dataurls = re.compile(`
			`+ r'^data:image/(.+);base64,', re.I).findall`
			`_is_possibly_malicious_scheme = re.compile(`
			`- r'(?:javascript|jscript|livescript|vbscript|data|about|mocha):',`
			`- re.I).search`
			`+ r'(javascript|jscript|livescript|vbscript|data|about|mocha):',`
			`+ re.I).findall`
			`+# SVG images can contain script content`
			`+_is_unsafe_image_type = re.compile(r"(xml|svg)", re.I).findall`
			`+`
			`def _is_javascript_scheme(s):`
			`- if _is_image_dataurl(s):`
			`- return None`
			`- return _is_possibly_malicious_scheme(s)`
			`+ is_image_url = False`
			`+ for image_type in _find_image_dataurls(s):`
			`+ is_image_url = True`
			`+ if _is_unsafe_image_type(image_type):`
			`+ return True`
			`+ if is_image_url:`
			`+ return False`
			`+ return bool(_is_possibly_malicious_scheme(s))`

			`_substitute_whitespace = re.compile(r'[\s\x00-\x08\x0B\x0C\x0E-\x19]+').sub`
			`-# FIXME: should data: be blocked?`

			`# FIXME: check against: http://msdn2.microsoft.com/en-us/library/ms537512.aspx`
			`_conditional_comment_re = re.compile(`
			`@@ -514,6 +521,8 @@ class Cleaner(object):`
			`return True`
			`if 'expression(' in style:`
			`return True`
			`+ if '@import' in style:`
			`+ return True`
			`if '</noscript' in style:`
			`# e.g. '<noscript><style><a title="</noscript><img src=x onerror=alert(1)>">'`
			`return True`
			`diff --git a/src/lxml/html/tests/test_clean.py b/src/lxml/html/tests/test_clean.py`
			`index e40cdad..4fab442 100644`
			`--- a/src/lxml/html/tests/test_clean.py`
			`+++ b/src/lxml/html/tests/test_clean.py`
			`@@ -1,3 +1,6 @@`
			`+import base64`
			`+import gzip`
			`+import io`
			`import unittest, sys`
			`from lxml.tests.common_imports import make_doctest`
			`from lxml.etree import LIBXML_VERSION`
			`@@ -89,6 +92,73 @@ class CleanerTest(unittest.TestCase):`
			`b'<math><style>/* deleted */</style></math>',`
			`lxml.html.tostring(clean_html(s)))`

			`+ def test_sneaky_import_in_style(self):`
			`+ # Prevent "@@importimport" -> "@import" replacement.`
			`+ style_codes = [`
			`+ "@@importimport(extstyle.css)",`
			`+ "@ @  import import(extstyle.css)",`
			`+ "@ @ importimport(extstyle.css)",`
			`+ "@@  import import(extstyle.css)",`
			`+ "@ @import import(extstyle.css)",`
			`+ "@@importimport()",`
			`+ ]`
			`+ for style_code in style_codes:`
			`+ html = '<style>%s</style>' % style_code`
			`+ s = lxml.html.fragment_fromstring(html)`
			`+`
			`+ cleaned = lxml.html.tostring(clean_html(s))`
			`+ self.assertEqual(`
			`+ b'<style>/* deleted */</style>',`
			`+ cleaned,`
			`+ "%s -> %s" % (style_code, cleaned))`
			`+`
			`+ def test_svg_data_links(self):`
			`+ # Remove SVG images with potentially insecure content.`
			`+ svg = b'<svg onload="alert(123)" />'`
			`+ gzout = io.BytesIO()`
			`+ f = gzip.GzipFile(fileobj=gzout, mode='wb')`
			`+ f.write(svg)`
			`+ f.close()`
			`+ svgz = gzout.getvalue()`
			`+ svg_b64 = base64.b64encode(svg).decode('ASCII')`
			`+ svgz_b64 = base64.b64encode(svgz).decode('ASCII')`
			`+ urls = [`
			`+ "data:image/svg+xml;base64," + svg_b64,`
			`+ "data:image/svg+xml-compressed;base64," + svgz_b64,`
			`+ ]`
			`+ for url in urls:`
			`+ html = '<img src="%s">' % url`
			`+ s = lxml.html.fragment_fromstring(html)`
			`+`
			`+ cleaned = lxml.html.tostring(clean_html(s))`
			`+ self.assertEqual(`
			`+ b'<img src="">',`
			`+ cleaned,`
			`+ "%s -> %s" % (url, cleaned))`
			`+`
			`+ def test_image_data_links(self):`
			`+ data = b'123'`
			`+ data_b64 = base64.b64encode(data).decode('ASCII')`
			`+ urls = [`
			`+ "data:image/jpeg;base64," + data_b64,`
			`+ "data:image/apng;base64," + data_b64,`
			`+ "data:image/png;base64," + data_b64,`
			`+ "data:image/gif;base64," + data_b64,`
			`+ "data:image/webp;base64," + data_b64,`
			`+ "data:image/bmp;base64," + data_b64,`
			`+ "data:image/tiff;base64," + data_b64,`
			`+ "data:image/x-icon;base64," + data_b64,`
			`+ ]`
			`+ for url in urls:`
			`+ html = '<img src="%s">' % url`
			`+ s = lxml.html.fragment_fromstring(html)`
			`+`
			`+ cleaned = lxml.html.tostring(clean_html(s))`
			`+ self.assertEqual(`
			`+ html.encode("UTF-8"),`
			`+ cleaned,`
			`+ "%s -> %s" % (url, cleaned))`
			`+`
			`def test_formaction_attribute_in_button_input(self):`
			`# The formaction attribute overrides the form's action and should be`
			`# treated as a malicious link attribute`