python-lxml/0002-Fix-CVE-2020-27783-mXS...

From 06fbba68024e0287b8e8f6da85ce840ffbfec3bc Mon Sep 17 00:00:00 2001
From: Mikolaj Izdebski <mizdebsk@redhat.com>
Date: Fri, 18 Dec 2020 16:08:43 +0100
Subject: [PATCH 2/2] Fix CVE-2020-27783: mXSS due to the use of improper
 parser

Backported from upstream commits 89e7aad6e7ff9ecd88678ff25f885988b184b26e
and a105ab8dc262ec6735977c25c13f0bdfcdec72a7
---
 src/lxml/html/clean.py | 25 +++++++++++++++++--------
 1 file changed, 17 insertions(+), 8 deletions(-)

diff --git a/src/lxml/html/clean.py b/src/lxml/html/clean.py
index da1f8706..c4fbfaa3 100644
--- a/src/lxml/html/clean.py
+++ b/src/lxml/html/clean.py
@@ -61,12 +61,15 @@ __all__ = ['clean_html', 'clean', 'Cleaner', 'autolink', 'autolink_html',

 # This is an IE-specific construct you can have in a stylesheet to
 # run some Javascript:
-_css_javascript_re = re.compile(
-    r'expression\s*\(.*?\)', re.S|re.I)
+_replace_css_javascript = re.compile(
+    r'expression\s*\(.*?\)', re.S|re.I).sub

 # Do I have to worry about @\nimport?
-_css_import_re = re.compile(
-    r'@\s*import', re.I)
+_replace_css_import = re.compile(
+    r'@\s*import', re.I).sub
+
+_looks_like_tag_content = re.compile(
+    r'</?[a-zA-Z]+|\son[a-zA-Z]+\s*=', re.ASCII).search

 # All kinds of schemes besides just javascript: that can cause
 # execution:
@@ -292,8 +295,8 @@ class Cleaner(object):
             if not self.inline_style:
                 for el in _find_styled_elements(doc):
                     old = el.get('style')
-                    new = _css_javascript_re.sub('', old)
-                    new = _css_import_re.sub('', new)
+                    new = _replace_css_javascript('', old)
+                    new = _replace_css_import('', new)
                     if self._has_sneaky_javascript(new):
                         # Something tricky is going on...
                         del el.attrib['style']
@@ -305,9 +308,9 @@ class Cleaner(object):
                         el.drop_tree()
                         continue
                     old = el.text or ''
-                    new = _css_javascript_re.sub('', old)
+                    new = _replace_css_javascript('', old)
                     # The imported CSS can do anything; we just can't allow:
-                    new = _css_import_re.sub('', old)
+                    new = _replace_css_import('', new)
                     if self._has_sneaky_javascript(new):
                         # Something tricky is going on...
                         el.text = '/* deleted */'
@@ -522,6 +525,12 @@ class Cleaner(object):
             return True
         if 'expression(' in style:
             return True
+        if '</noscript' in style:
+            # e.g. '<noscript><style><a title="</noscript><img src=x onerror=alert(1)>">'
+            return True
+        if _looks_like_tag_content(style):
+            # e.g. '<math><style><img src=x onerror=alert(1)></style></math>'
+            return True
         return False

     def clean_html(self, html):
--
2.26.2