From e46d1cdd8bd4cc12e8c8e8fdce10b3114a17d95e Mon Sep 17 00:00:00 2001
From: Robert Speicher <rspeicher@gmail.com>
Date: Mon, 27 Apr 2015 18:56:37 -0400
Subject: [PATCH] Add Gitlab::Markdown::SanitizationFilter

This just extends the HTML::Pipeline::SanitizationFilter with our custom
whitelist.
---
 app/helpers/gitlab_markdown_helper.rb         |  2 +-
 lib/gitlab/markdown.rb                        | 34 +-------
 lib/gitlab/markdown/sanitization_filter.rb    | 38 +++++++++
 spec/helpers/gitlab_markdown_helper_spec.rb   | 33 --------
 .../markdown/sanitization_filter_spec.rb      | 81 +++++++++++++++++++
 5 files changed, 123 insertions(+), 65 deletions(-)
 create mode 100644 lib/gitlab/markdown/sanitization_filter.rb
 create mode 100644 spec/lib/gitlab/markdown/sanitization_filter_spec.rb

diff --git a/app/helpers/gitlab_markdown_helper.rb b/app/helpers/gitlab_markdown_helper.rb
index 7dbffaae5f9..24263a0f619 100644
--- a/app/helpers/gitlab_markdown_helper.rb
+++ b/app/helpers/gitlab_markdown_helper.rb
@@ -34,7 +34,7 @@ module GitlabMarkdownHelper
 
       # see https://github.com/vmg/redcarpet#darling-i-packed-you-a-couple-renderers-for-lunch
       rend = Redcarpet::Render::GitlabHTML.new(self, user_color_scheme_class, {
-        # Handled further down the line by HTML::Pipeline::SanitizationFilter
+        # Handled further down the line by Gitlab::Markdown::SanitizationFilter
         escape_html: false
       }.merge(options))
 
diff --git a/lib/gitlab/markdown.rb b/lib/gitlab/markdown.rb
index beb97bbdf41..e7ddaab5c2a 100644
--- a/lib/gitlab/markdown.rb
+++ b/lib/gitlab/markdown.rb
@@ -38,6 +38,7 @@ module Gitlab
     autoload :IssueReferenceFilter,         'gitlab/markdown/issue_reference_filter'
     autoload :LabelReferenceFilter,         'gitlab/markdown/label_reference_filter'
     autoload :MergeRequestReferenceFilter,  'gitlab/markdown/merge_request_reference_filter'
+    autoload :SanitizationFilter,           'gitlab/markdown/sanitization_filter'
     autoload :SnippetReferenceFilter,       'gitlab/markdown/snippet_reference_filter'
     autoload :TableOfContentsFilter,        'gitlab/markdown/table_of_contents_filter'
     autoload :UserReferenceFilter,          'gitlab/markdown/user_reference_filter'
@@ -76,9 +77,6 @@ module Gitlab
       pipeline = HTML::Pipeline.new(filters)
 
       context = {
-        # SanitizationFilter
-        whitelist: sanitization_whitelist,
-
         # EmojiFilter
         asset_root: Gitlab.config.gitlab.url,
         asset_host: Gitlab::Application.config.asset_host,
@@ -116,10 +114,10 @@ module Gitlab
     # SanitizationFilter should come first so that all generated reference HTML
     # goes through untouched.
     #
-    # See https://gitlab.com/gitlab-org/html-pipeline-gitlab for more filters
+    # See https://github.com/jch/html-pipeline#filters for more filters.
     def filters
       [
-        HTML::Pipeline::SanitizationFilter,
+        Gitlab::Markdown::SanitizationFilter,
 
         Gitlab::Markdown::EmojiFilter,
         Gitlab::Markdown::TableOfContentsFilter,
@@ -136,32 +134,6 @@ module Gitlab
       ]
     end
 
-    # Customize the SanitizationFilter whitelist
-    #
-    # - Allow `class` and `id` attributes on all elements
-    # - Allow `span` elements
-    # - Remove `rel` attributes from `a` elements
-    # - Remove `a` nodes with `javascript:` in the `href` attribute
-    def sanitization_whitelist
-      whitelist = HTML::Pipeline::SanitizationFilter::WHITELIST
-      whitelist[:attributes][:all].push('class', 'id')
-      whitelist[:elements].push('span')
-
-      fix_anchors = lambda do |env|
-        name, node = env[:node_name], env[:node]
-        if name == 'a'
-          node.remove_attribute('rel')
-          if node['href'] && node['href'].match('javascript:')
-            node.remove_attribute('href')
-          end
-        end
-      end
-
-      whitelist[:transformers].push(fix_anchors)
-
-      whitelist
-    end
-
     # Turn list items that start with "[ ]" into HTML checkbox inputs.
     def parse_tasks(text)
       li_tag = '<li class="task-list-item">'
diff --git a/lib/gitlab/markdown/sanitization_filter.rb b/lib/gitlab/markdown/sanitization_filter.rb
new file mode 100644
index 00000000000..9a154e0b2fe
--- /dev/null
+++ b/lib/gitlab/markdown/sanitization_filter.rb
@@ -0,0 +1,38 @@
+require 'html/pipeline/filter'
+require 'html/pipeline/sanitization_filter'
+
+module Gitlab
+  module Markdown
+    # Sanitize HTML
+    #
+    # Extends HTML::Pipeline::SanitizationFilter with a custom whitelist.
+    class SanitizationFilter < HTML::Pipeline::SanitizationFilter
+      def whitelist
+        whitelist = HTML::Pipeline::SanitizationFilter::WHITELIST
+
+        # Allow `class` and `id` on all elements
+        whitelist[:attributes][:all].push('class', 'id')
+
+        # Allow table alignment
+        whitelist[:attributes]['th'] = %w(style)
+        whitelist[:attributes]['td'] = %w(style)
+
+        # Allow span elements
+        whitelist[:elements].push('span')
+
+        # Remove `rel` attribute from `a` elements
+        whitelist[:transformers].push(remove_rel)
+
+        whitelist
+      end
+
+      def remove_rel
+        lambda do |env|
+          if env[:node_name] == 'a'
+            env[:node].remove_attribute('rel')
+          end
+        end
+      end
+    end
+  end
+end
diff --git a/spec/helpers/gitlab_markdown_helper_spec.rb b/spec/helpers/gitlab_markdown_helper_spec.rb
index bd2240c5997..ff0f049ce6c 100644
--- a/spec/helpers/gitlab_markdown_helper_spec.rb
+++ b/spec/helpers/gitlab_markdown_helper_spec.rb
@@ -316,11 +316,6 @@ describe GitlabMarkdownHelper do
           expected = ""
           expect(markdown(actual)).to match(expected)
         end
-
-        it 'should allow whitelisted HTML tags from the user' do
-          actual = '<dl><dt>Term</dt><dd>Definition</dd></dl>'
-          expect(markdown(actual)).to match(actual)
-        end
       end
 
       context 'with an empty repository' do
@@ -336,34 +331,6 @@ describe GitlabMarkdownHelper do
         end
       end
     end
-
-    # SANITIZATION ------------------------------------------------------------
-    # TODO (rspeicher): These are testing SanitizationFilter, not `markdown`
-
-    it 'should sanitize tags that are not whitelisted' do
-      actual = '<textarea>no inputs allowed</textarea> <blink>no blinks</blink>'
-      expected = 'no inputs allowed no blinks'
-      expect(markdown(actual)).to match(expected)
-      expect(markdown(actual)).not_to match('<.textarea>')
-      expect(markdown(actual)).not_to match('<.blink>')
-    end
-
-    it 'should allow whitelisted tag attributes from the user' do
-      actual = '<a class="custom">link text</a>'
-      expect(markdown(actual)).to match(actual)
-    end
-
-    it 'should sanitize tag attributes that are not whitelisted' do
-      actual = '<a href="http://example.com/bar.html" foo="bar">link text</a>'
-      expected = '<a href="http://example.com/bar.html">link text</a>'
-      expect(markdown(actual)).to match(expected)
-    end
-
-    it 'should sanitize javascript in attributes' do
-      actual = %q(<a href="javascript:alert('foo')">link text</a>)
-      expected = '<a>link text</a>'
-      expect(markdown(actual)).to match(expected)
-    end
   end
 
   describe '#render_wiki_content' do
diff --git a/spec/lib/gitlab/markdown/sanitization_filter_spec.rb b/spec/lib/gitlab/markdown/sanitization_filter_spec.rb
new file mode 100644
index 00000000000..ab909a68635
--- /dev/null
+++ b/spec/lib/gitlab/markdown/sanitization_filter_spec.rb
@@ -0,0 +1,81 @@
+require 'spec_helper'
+
+module Gitlab::Markdown
+  describe SanitizationFilter do
+    def filter(html, options = {})
+      described_class.call(html, options)
+    end
+
+    describe 'default whitelist' do
+      it 'sanitizes tags that are not whitelisted' do
+        act = %q{<textarea>no inputs</textarea> and <blink>no blinks</blink>}
+        exp = 'no inputs and no blinks'
+        expect(filter(act).to_html).to eq exp
+      end
+
+      it 'sanitizes tag attributes' do
+        act = %q{<a href="http://example.com/bar.html" onclick="bar">Text</a>}
+        exp = %q{<a href="http://example.com/bar.html">Text</a>}
+        expect(filter(act).to_html).to eq exp
+      end
+
+      it 'sanitizes javascript in attributes' do
+        act = %q(<a href="javascript:alert('foo')">Text</a>)
+        exp = '<a>Text</a>'
+        expect(filter(act).to_html).to eq exp
+      end
+
+      it 'allows whitelisted HTML tags from the user' do
+        exp = act = "<dl>\n<dt>Term</dt>\n<dd>Definition</dd>\n</dl>"
+        expect(filter(act).to_html).to eq exp
+      end
+    end
+
+    describe 'custom whitelist' do
+      it 'allows `class` attribute on any element' do
+        exp = act = %q{<strong class="foo">Strong</strong>}
+        expect(filter(act).to_html).to eq exp
+      end
+
+      it 'allows `id` attribute on any element' do
+        exp = act = %q{<em id="foo">Emphasis</em>}
+        expect(filter(act).to_html).to eq exp
+      end
+
+      it 'allows `style` attribute on table elements' do
+        html = <<-HTML.strip_heredoc
+        <table>
+          <tr><th style="text-align: center">Head</th></tr>
+          <tr><td style="text-align: right">Body</th></tr>
+        </table>
+        HTML
+
+        doc = filter(html)
+
+        expect(doc.at_css('th')['style']).to eq 'text-align: center'
+        expect(doc.at_css('td')['style']).to eq 'text-align: right'
+      end
+
+      it 'allows `span` elements' do
+        exp = act = %q{<span>Hello</span>}
+        expect(filter(act).to_html).to eq exp
+      end
+
+      it 'removes `rel` attribute from `a` elements' do
+        doc = filter(%q{<a href="#" rel="nofollow">Link</a>})
+
+        expect(doc.css('a').size).to eq 1
+        expect(doc.at_css('a')['href']).to eq '#'
+        expect(doc.at_css('a')['rel']).to be_nil
+      end
+
+      it 'removes script-like `href` attribute from `a` elements' do
+        html = %q{<a href="javascript:alert('Hi')">Hi</a>}
+        doc = filter(html)
+
+        expect(doc.css('a').size).to eq 1
+        expect(doc.at_css('a')['href']).to be_nil
+      end
+    end
+  end
+end
-- 
GitLab