From 16f8ca566b8637dc8092a6b630c23a82a905b437 Mon Sep 17 00:00:00 2001
From: Robert Speicher <rspeicher@gmail.com>
Date: Thu, 1 Oct 2015 23:40:29 -0400
Subject: [PATCH] Add custom protocol whitelisting to SanitizationFilter

Addresses internal https://dev.gitlab.org/gitlab/gitlabhq/issues/2613
---
 lib/gitlab/markdown/sanitization_filter.rb    |  19 ++++
 .../markdown/sanitization_filter_spec.rb      | 101 ++++++++++++++++--
 2 files changed, 110 insertions(+), 10 deletions(-)

diff --git a/lib/gitlab/markdown/sanitization_filter.rb b/lib/gitlab/markdown/sanitization_filter.rb
index e368de7d848..ffb9dc33b64 100644
--- a/lib/gitlab/markdown/sanitization_filter.rb
+++ b/lib/gitlab/markdown/sanitization_filter.rb
@@ -48,6 +48,12 @@ module Gitlab
         # Allow span elements
         whitelist[:elements].push('span')
 
+        # Allow any protocol in `a` elements...
+        whitelist[:protocols].delete('a')
+
+        # ...but then remove links with the `javascript` protocol
+        whitelist[:transformers].push(remove_javascript_links)
+
         # Remove `rel` attribute from `a` elements
         whitelist[:transformers].push(remove_rel)
 
@@ -57,6 +63,19 @@ module Gitlab
         whitelist
       end
 
+      def remove_javascript_links
+        lambda do |env|
+          node = env[:node]
+
+          return unless node.name == 'a'
+          return unless node.has_attribute?('href')
+
+          if node['href'].start_with?('javascript', ':javascript')
+            node.remove_attribute('href')
+          end
+        end
+      end
+
       def remove_rel
         lambda do |env|
           if env[:node_name] == 'a'
diff --git a/spec/lib/gitlab/markdown/sanitization_filter_spec.rb b/spec/lib/gitlab/markdown/sanitization_filter_spec.rb
index e50c82d0b3c..27cd00e8054 100644
--- a/spec/lib/gitlab/markdown/sanitization_filter_spec.rb
+++ b/spec/lib/gitlab/markdown/sanitization_filter_spec.rb
@@ -44,7 +44,7 @@ module Gitlab::Markdown
         instance = described_class.new('Foo')
         3.times { instance.whitelist }
 
-        expect(instance.whitelist[:transformers].size).to eq 4
+        expect(instance.whitelist[:transformers].size).to eq 5
       end
 
       it 'allows syntax highlighting' do
@@ -77,19 +77,100 @@ module Gitlab::Markdown
       end
 
       it 'removes `rel` attribute from `a` elements' do
-        doc = filter(%q{<a href="#" rel="nofollow">Link</a>})
+        act = %q{<a href="#" rel="nofollow">Link</a>}
+        exp = %q{<a href="#">Link</a>}
 
-        expect(doc.css('a').size).to eq 1
-        expect(doc.at_css('a')['href']).to eq '#'
-        expect(doc.at_css('a')['rel']).to be_nil
+        expect(filter(act).to_html).to eq exp
       end
 
-      it 'removes script-like `href` attribute from `a` elements' do
-        html = %q{<a href="javascript:alert('Hi')">Hi</a>}
-        doc = filter(html)
+      # Adapted from the Sanitize test suite: http://git.io/vczrM
+      protocols = {
+        'protocol-based JS injection: simple, no spaces' => {
+          input:  '<a href="javascript:alert(\'XSS\');">foo</a>',
+          output: '<a>foo</a>'
+        },
+
+        'protocol-based JS injection: simple, spaces before' => {
+          input:  '<a href="javascript    :alert(\'XSS\');">foo</a>',
+          output: '<a>foo</a>'
+        },
+
+        'protocol-based JS injection: simple, spaces after' => {
+          input:  '<a href="javascript:    alert(\'XSS\');">foo</a>',
+          output: '<a>foo</a>'
+        },
+
+        'protocol-based JS injection: simple, spaces before and after' => {
+          input:  '<a href="javascript    :   alert(\'XSS\');">foo</a>',
+          output: '<a>foo</a>'
+        },
+
+        'protocol-based JS injection: preceding colon' => {
+          input:  '<a href=":javascript:alert(\'XSS\');">foo</a>',
+          output: '<a>foo</a>'
+        },
+
+        'protocol-based JS injection: UTF-8 encoding' => {
+          input:  '<a href="javascript&#58;">foo</a>',
+          output: '<a>foo</a>'
+        },
+
+        'protocol-based JS injection: long UTF-8 encoding' => {
+          input:  '<a href="javascript&#0058;">foo</a>',
+          output: '<a>foo</a>'
+        },
+
+        'protocol-based JS injection: long UTF-8 encoding without semicolons' => {
+          input:  '<a href=&#0000106&#0000097&#0000118&#0000097&#0000115&#0000099&#0000114&#0000105&#0000112&#0000116&#0000058&#0000097&#0000108&#0000101&#0000114&#0000116&#0000040&#0000039&#0000088&#0000083&#0000083&#0000039&#0000041>foo</a>',
+          output: '<a>foo</a>'
+        },
+
+        'protocol-based JS injection: hex encoding' => {
+          input:  '<a href="javascript&#x3A;">foo</a>',
+          output: '<a>foo</a>'
+        },
+
+        'protocol-based JS injection: long hex encoding' => {
+          input:  '<a href="javascript&#x003A;">foo</a>',
+          output: '<a>foo</a>'
+        },
+
+        'protocol-based JS injection: hex encoding without semicolons' => {
+          input:  '<a href=&#x6A&#x61&#x76&#x61&#x73&#x63&#x72&#x69&#x70&#x74&#x3A&#x61&#x6C&#x65&#x72&#x74&#x28&#x27&#x58&#x53&#x53&#x27&#x29>foo</a>',
+          output: '<a>foo</a>'
+        },
+
+        'protocol-based JS injection: null char' => {
+          input:  "<a href=java\0script:alert(\"XSS\")>foo</a>",
+          output: '<a href="java"></a>'
+        },
+
+        'protocol-based JS injection: spaces and entities' => {
+          input:  '<a href=" &#14;  javascript:alert(\'XSS\');">foo</a>',
+          output: '<a href="">foo</a>'
+        },
+      }
+
+      protocols.each do |name, data|
+        it "handles #{name}" do
+          doc = filter(data[:input])
+
+          expect(doc.to_html).to eq data[:output]
+        end
+      end
+
+      it 'allows non-standard anchor schemes' do
+        exp = %q{<a href="irc://irc.freenode.net/git">IRC</a>}
+        act = filter(exp)
+
+        expect(act.to_html).to eq exp
+      end
+
+      it 'allows relative links' do
+        exp = %q{<a href="foo/bar.md">foo/bar.md</a>}
+        act = filter(exp)
 
-        expect(doc.css('a').size).to eq 1
-        expect(doc.at_css('a')['href']).to be_nil
+        expect(act.to_html).to eq exp
       end
     end
 
-- 
GitLab