From 918397797178f667d3d08f36893b7c17bc11f263 Mon Sep 17 00:00:00 2001
From: Stan Hu <stanhu@gmail.com>
Date: Sat, 4 Jun 2016 16:37:48 -0700
Subject: [PATCH 1/3] Increase the max amount of blob data to load from 1K to
 10 MB

Before we were only loading 1024 bytes, which could lead to false negative
detection of binary data since binary characters could come after the 1024-byte
mark. Instead, we should load as much data as we are willing to display
to ensure that this does not happen.

Relates to gitlab-org/gitlab-ce#13826
---
 CHANGELOG              |  3 +++
 lib/gitlab_git/blob.rb | 15 ++++++++-------
 spec/blob_spec.rb      |  8 ++++----
 3 files changed, 15 insertions(+), 11 deletions(-)

diff --git a/CHANGELOG b/CHANGELOG
index 771e4ce..704f986 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,3 +1,6 @@
+v 10.1.3
+  - Increase the max amount of blob data to load from 1K to 10 MB
+
 v 10.1.2
   - DiffCollections now enforce a total byte limit of 5 KB * max_files
 
diff --git a/lib/gitlab_git/blob.rb b/lib/gitlab_git/blob.rb
index 275d88d..9ab33b9 100644
--- a/lib/gitlab_git/blob.rb
+++ b/lib/gitlab_git/blob.rb
@@ -7,10 +7,11 @@ module Gitlab
       include Linguist::BlobHelper
       include EncodingHelper
 
-      # This number needs to be large enough to allow reliable content /
-      # encoding detection (Linguist) and LFS pointer parsing. All other cases
-      # where we need full blob data should use load_all_data!.
-      DATA_FRAGMENT_SIZE = 1024
+      # This number is the maximum amount of data that we want to display to
+      # the user. We load as much as we can for encoding detection
+      # (Linguist) and LFS pointer parsing. All other cases where we need full
+      # blob data should use load_all_data!.
+      MAX_DATA_DISPLAY_SIZE = 10485760
 
       attr_accessor :name, :path, :size, :data, :mode, :id, :commit_id
 
@@ -33,7 +34,7 @@ module Gitlab
                 id: blob.oid,
                 name: blob_entry[:name],
                 size: blob.size,
-                data: blob.content(DATA_FRAGMENT_SIZE),
+                data: blob.content(MAX_DATA_DISPLAY_SIZE),
                 mode: blob_entry[:filemode].to_s(8),
                 path: path,
                 commit_id: sha,
@@ -48,7 +49,7 @@ module Gitlab
           Blob.new(
             id: blob.oid,
             size: blob.size,
-            data: blob.content(DATA_FRAGMENT_SIZE),
+            data: blob.content(MAX_DATA_DISPLAY_SIZE),
           )
         end
 
@@ -224,7 +225,7 @@ module Gitlab
         encode! @data
       end
 
-      # Load all blob data (not just the first DATA_FRAGMENT_SIZE bytes) into
+      # Load all blob data (not just the first MAX_DATA_DISPLAY_SIZE bytes) into
       # memory as a Ruby string.
       def load_all_data!(repository)
         return if @data == '' # don't mess with submodule blobs
diff --git a/spec/blob_spec.rb b/spec/blob_spec.rb
index 100f826..2ef7d0e 100644
--- a/spec/blob_spec.rb
+++ b/spec/blob_spec.rb
@@ -65,10 +65,10 @@ describe Gitlab::Git::Blob do
       let(:blob_size) { 111803 }
 
       it { expect(blob.size).to eq(blob_size) }
-      it { expect(blob.data.length).to eq(Gitlab::Git::Blob::DATA_FRAGMENT_SIZE) }
+      it { expect(blob.data.length).to eq(Gitlab::Git::Blob::MAX_DATA_DISPLAY_SIZE) }
 
       it 'check that this test is sane' do
-        expect(blob.size).to be > Gitlab::Git::Blob::DATA_FRAGMENT_SIZE
+        expect(blob.size).to be > Gitlab::Git::Blob::MAX_DATA_DISPLAY_SIZE
       end
 
       it 'can load all data' do
@@ -89,10 +89,10 @@ describe Gitlab::Git::Blob do
       let(:blob_size) { 111803 }
 
       it { expect(blob.size).to eq(blob_size) }
-      it { expect(blob.data.length).to eq(Gitlab::Git::Blob::DATA_FRAGMENT_SIZE) }
+      it { expect(blob.data.length).to eq(Gitlab::Git::Blob::MAX_DATA_DISPLAY_SIZE) }
       
       it 'check that this test is sane' do
-        expect(blob.size).to be > Gitlab::Git::Blob::DATA_FRAGMENT_SIZE
+        expect(blob.size).to be > Gitlab::Git::Blob::MAX_DATA_DISPLAY_SIZE
       end
     end
   end
-- 
GitLab


From 264be9ae9b4a5f8a30f26b79fcdce24ca697d68a Mon Sep 17 00:00:00 2001
From: Stan Hu <stanhu@gmail.com>
Date: Sat, 4 Jun 2016 21:34:48 -0700
Subject: [PATCH 2/3] Make specs pass for whatever data size limits used by
 Blob

---
 spec/blob_spec.rb | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/spec/blob_spec.rb b/spec/blob_spec.rb
index 2ef7d0e..0abb69f 100644
--- a/spec/blob_spec.rb
+++ b/spec/blob_spec.rb
@@ -65,10 +65,10 @@ describe Gitlab::Git::Blob do
       let(:blob_size) { 111803 }
 
       it { expect(blob.size).to eq(blob_size) }
-      it { expect(blob.data.length).to eq(Gitlab::Git::Blob::MAX_DATA_DISPLAY_SIZE) }
+      it { expect(blob.data.length).to eq(blob_size) }
 
       it 'check that this test is sane' do
-        expect(blob.size).to be > Gitlab::Git::Blob::MAX_DATA_DISPLAY_SIZE
+        expect(blob.size).to be <= Gitlab::Git::Blob::MAX_DATA_DISPLAY_SIZE
       end
 
       it 'can load all data' do
@@ -85,14 +85,14 @@ describe Gitlab::Git::Blob do
     it { expect(raw_blob.size).to eq(669) }
 
     context 'large file' do
-      let(:blob) { Gitlab::Git::Blob.raw(repository, '08cf843fd8fe1c50757df0a13fcc44661996b4df') }
-      let(:blob_size) { 111803 }
-
-      it { expect(blob.size).to eq(blob_size) }
-      it { expect(blob.data.length).to eq(Gitlab::Git::Blob::MAX_DATA_DISPLAY_SIZE) }
-      
-      it 'check that this test is sane' do
-        expect(blob.size).to be > Gitlab::Git::Blob::MAX_DATA_DISPLAY_SIZE
+      it 'limits the size of a large file' do
+        blob_size = Gitlab::Git::Blob::MAX_DATA_DISPLAY_SIZE + 1
+        buffer = Array.new(blob_size, 0)
+        rugged_blob = Rugged::Blob.from_buffer(repository.rugged, buffer.join(''))
+        blob = Gitlab::Git::Blob.raw(repository, rugged_blob)
+
+        expect(blob.size).to eq(blob_size)
+        expect(blob.data.length).to eq(Gitlab::Git::Blob::MAX_DATA_DISPLAY_SIZE)
       end
     end
   end
-- 
GitLab


From a5b4fc84116005ba575fba239e394b8785db05a8 Mon Sep 17 00:00:00 2001
From: Stan Hu <stanhu@gmail.com>
Date: Wed, 8 Jun 2016 21:24:29 -0700
Subject: [PATCH 3/3] Add method to indicate blob was truncated

---
 lib/gitlab_git/blob.rb | 4 ++++
 spec/blob_spec.rb      | 2 ++
 2 files changed, 6 insertions(+)

diff --git a/lib/gitlab_git/blob.rb b/lib/gitlab_git/blob.rb
index 9ab33b9..b95a97a 100644
--- a/lib/gitlab_git/blob.rb
+++ b/lib/gitlab_git/blob.rb
@@ -266,6 +266,10 @@ module Gitlab
         nil
       end
 
+      def truncated?
+        size > data.size
+      end
+
       private
 
       def has_lfs_version_key?
diff --git a/spec/blob_spec.rb b/spec/blob_spec.rb
index 0abb69f..415f346 100644
--- a/spec/blob_spec.rb
+++ b/spec/blob_spec.rb
@@ -83,6 +83,7 @@ describe Gitlab::Git::Blob do
     it { expect(raw_blob.id).to eq(SeedRepo::RubyBlob::ID) }
     it { expect(raw_blob.data[0..10]).to eq("require \'fi") }
     it { expect(raw_blob.size).to eq(669) }
+    it { expect(raw_blob.truncated?).to be_falsey }
 
     context 'large file' do
       it 'limits the size of a large file' do
@@ -93,6 +94,7 @@ describe Gitlab::Git::Blob do
 
         expect(blob.size).to eq(blob_size)
         expect(blob.data.length).to eq(Gitlab::Git::Blob::MAX_DATA_DISPLAY_SIZE)
+        expect(blob.truncated?).to be_truthy
       end
     end
   end
-- 
GitLab