From af1f6844c98bfb4adda1c20dc75b808f031a4256 Mon Sep 17 00:00:00 2001
From: Yorick Peterse <yorickpeterse@gmail.com>
Date: Thu, 29 Jun 2017 15:37:37 +0200
Subject: [PATCH] Added code for defining SHA attributes

These attributes are stored in binary in the database, but exposed as
strings. This allows one to query/create data using plain SHA1 hashes as
Strings, while storing them more efficiently as binary.
---
 app/models/concerns/sha_attribute.rb          | 18 ++++++++++
 doc/development/README.md                     |  1 +
 doc/development/sha1_as_binary.md             | 36 +++++++++++++++++++
 lib/gitlab/database/sha_attribute.rb          | 34 ++++++++++++++++++
 .../lib/gitlab/database/sha_attribute_spec.rb | 33 +++++++++++++++++
 spec/models/concerns/sha_attribute_spec.rb    | 27 ++++++++++++++
 6 files changed, 149 insertions(+)
 create mode 100644 app/models/concerns/sha_attribute.rb
 create mode 100644 doc/development/sha1_as_binary.md
 create mode 100644 lib/gitlab/database/sha_attribute.rb
 create mode 100644 spec/lib/gitlab/database/sha_attribute_spec.rb
 create mode 100644 spec/models/concerns/sha_attribute_spec.rb

diff --git a/app/models/concerns/sha_attribute.rb b/app/models/concerns/sha_attribute.rb
new file mode 100644
index 00000000000..c28974a3cdf
--- /dev/null
+++ b/app/models/concerns/sha_attribute.rb
@@ -0,0 +1,18 @@
+module ShaAttribute
+  extend ActiveSupport::Concern
+
+  module ClassMethods
+    def sha_attribute(name)
+      column = columns.find { |c| c.name == name.to_s }
+
+      # In case the table doesn't exist we won't be able to find the column,
+      # thus we will only check the type if the column is present.
+      if column && column.type != :binary
+        raise ArgumentError,
+          "sha_attribute #{name.inspect} is invalid since the column type is not :binary"
+      end
+
+      attribute(name, Gitlab::Database::ShaAttribute.new)
+    end
+  end
+end
diff --git a/doc/development/README.md b/doc/development/README.md
index 9496a87d84d..a2a07c37ced 100644
--- a/doc/development/README.md
+++ b/doc/development/README.md
@@ -54,6 +54,7 @@
 - [Polymorphic Associations](polymorphic_associations.md)
 - [Single Table Inheritance](single_table_inheritance.md)
 - [Background Migrations](background_migrations.md)
+- [Storing SHA1 Hashes As Binary](sha1_as_binary.md)
 
 ## i18n
 
diff --git a/doc/development/sha1_as_binary.md b/doc/development/sha1_as_binary.md
new file mode 100644
index 00000000000..3151cc29bbc
--- /dev/null
+++ b/doc/development/sha1_as_binary.md
@@ -0,0 +1,36 @@
+# Storing SHA1 Hashes As Binary
+
+Storing SHA1 hashes as strings is not very space efficient. A SHA1 as a string
+requires at least 40 bytes, an additional byte to store the encoding, and
+perhaps more space depending on the internals of PostgreSQL and MySQL.
+
+On the other hand, if one were to store a SHA1 as binary one would only need 20
+bytes for the actual SHA1, and 1 or 4 bytes of additional space (again depending
+on database internals). This means that in the best case scenario we can reduce
+the space usage by 50%.
+
+To make this easier to work with you can include the concern `ShaAttribute` into
+a model and define a SHA attribute using the `sha_attribute` class method. For
+example:
+
+```ruby
+class Commit < ActiveRecord::Base
+  include ShaAttribute
+
+  sha_attribute :sha
+end
+```
+
+This allows you to use the value of the `sha` attribute as if it were a string,
+while storing it as binary. This means that you can do something like this,
+without having to worry about converting data to the right binary format:
+
+```ruby
+commit = Commit.find_by(sha: '88c60307bd1f215095834f09a1a5cb18701ac8ad')
+commit.sha = '971604de4cfa324d91c41650fabc129420c8d1cc'
+commit.save
+```
+
+There is however one requirement: the column used to store the SHA has _must_ be
+a binary type. For Rails this means you need to use the `:binary` type instead
+of `:text` or `:string`.
diff --git a/lib/gitlab/database/sha_attribute.rb b/lib/gitlab/database/sha_attribute.rb
new file mode 100644
index 00000000000..d9400e04b83
--- /dev/null
+++ b/lib/gitlab/database/sha_attribute.rb
@@ -0,0 +1,34 @@
+module Gitlab
+  module Database
+    BINARY_TYPE = if Gitlab::Database.postgresql?
+                    # PostgreSQL defines its own class with slightly different
+                    # behaviour from the default Binary type.
+                    ActiveRecord::ConnectionAdapters::PostgreSQL::OID::Bytea
+                  else
+                    ActiveRecord::Type::Binary
+                  end
+
+    # Class for casting binary data to hexadecimal SHA1 hashes (and vice-versa).
+    #
+    # Using ShaAttribute allows you to store SHA1 values as binary while still
+    # using them as if they were stored as string values. This gives you the
+    # ease of use of string values, but without the storage overhead.
+    class ShaAttribute < BINARY_TYPE
+      PACK_FORMAT = 'H*'.freeze
+
+      # Casts binary data to a SHA1 in hexadecimal.
+      def type_cast_from_database(value)
+        value = super
+
+        value ? value.unpack(PACK_FORMAT)[0] : nil
+      end
+
+      # Casts a SHA1 in hexadecimal to the proper binary format.
+      def type_cast_for_database(value)
+        arg = value ? [value].pack(PACK_FORMAT) : nil
+
+        super(arg)
+      end
+    end
+  end
+end
diff --git a/spec/lib/gitlab/database/sha_attribute_spec.rb b/spec/lib/gitlab/database/sha_attribute_spec.rb
new file mode 100644
index 00000000000..62c1d37ea1c
--- /dev/null
+++ b/spec/lib/gitlab/database/sha_attribute_spec.rb
@@ -0,0 +1,33 @@
+require 'spec_helper'
+
+describe Gitlab::Database::ShaAttribute do
+  let(:sha) do
+    '9a573a369a5bfbb9a4a36e98852c21af8a44ea8b'
+  end
+
+  let(:binary_sha) do
+    [sha].pack('H*')
+  end
+
+  let(:binary_from_db) do
+    if Gitlab::Database.postgresql?
+      "\\x#{sha}"
+    else
+      binary_sha
+    end
+  end
+
+  let(:attribute) { described_class.new }
+
+  describe '#type_cast_from_database' do
+    it 'converts the binary SHA to a String' do
+      expect(attribute.type_cast_from_database(binary_from_db)).to eq(sha)
+    end
+  end
+
+  describe '#type_cast_for_database' do
+    it 'converts a SHA String to binary data' do
+      expect(attribute.type_cast_for_database(sha).to_s).to eq(binary_sha)
+    end
+  end
+end
diff --git a/spec/models/concerns/sha_attribute_spec.rb b/spec/models/concerns/sha_attribute_spec.rb
new file mode 100644
index 00000000000..9e37c2b20c4
--- /dev/null
+++ b/spec/models/concerns/sha_attribute_spec.rb
@@ -0,0 +1,27 @@
+require 'spec_helper'
+
+describe ShaAttribute do
+  let(:model) { Class.new { include ShaAttribute } }
+
+  before do
+    columns = [
+      double(:column, name: 'name', type: :text),
+      double(:column, name: 'sha1', type: :binary)
+    ]
+
+    allow(model).to receive(:columns).and_return(columns)
+  end
+
+  describe '#sha_attribute' do
+    it 'defines a SHA attribute for a binary column' do
+      expect(model).to receive(:attribute)
+        .with(:sha1, an_instance_of(Gitlab::Database::ShaAttribute))
+
+      model.sha_attribute(:sha1)
+    end
+
+    it 'raises ArgumentError when the column type is not :binary' do
+      expect { model.sha_attribute(:name) }.to raise_error(ArgumentError)
+    end
+  end
+end
-- 
GitLab