Skip to content
Snippets Groups Projects
Commit 62927165 authored by Douwe Maan's avatar Douwe Maan
Browse files

Merge branch 'ruby-gitattributes-parser' into 'master'

Parse Git attribute files using Ruby

Commit 340e111e contains all the details. It's quite the read so the short summary is:

> Rugged is slow as heck because it runs multiple IO calls every time you request a set of Git attributes. gitlab_git now provides a pure Ruby parser that avoids this and is between 4 and 6 times faster.

Here's a Grafana screenshot to show how bad it can get:

![timings](/uploads/39f7b6b7b6a8d97f2b11a20a088988e4/timings.jpg)

See https://gitlab.com/gitlab-org/gitlab-ce/issues/10785 for more information.

See merge request !121
parents b205c79e 340e111e
No related branches found
No related tags found
1 merge request!121Parse Git attribute files using Ruby
Pipeline #
v 10.6.0
- Use a pure Ruby Git attributes file parser to drastically reduce time spent in parsing Git attributes
v 10.5.0
- Add Repository#find_branch to speed up branch lookups
- Add Repository#reload_rugged to force a refresh of the Rugged repository
Loading
Loading
Loading
Loading
@@ -25,3 +25,4 @@ require_relative "gitlab_git/ref"
require_relative "gitlab_git/branch"
require_relative "gitlab_git/tag"
require_relative "gitlab_git/util"
require_relative "gitlab_git/attributes"
module Gitlab
module Git
# Class for parsing Git attribute files and extracting the attributes for
# file patterns.
#
# Unlike Rugged this parser only needs a single IO call (a call to `open`),
# vastly reducing the time spent in extracting attributes.
#
# This class _only_ supports parsing the attributes file located at
# `$GIT_DIR/info/attributes` as GitLab doesn't use any other files
# (`.gitattributes` is copied to this particular path).
#
# Basic usage:
#
# attributes = Gitlab::Git::Attributes.new(some_repo.path)
#
# attributes.attributes('README.md') # => { "eol" => "lf" }
class Attributes
  # path - The path to the Git repository.
  def initialize(path)
    @path = path
    @patterns = nil
  end

  # Returns all the Git attributes for the given path.
  #
  # Patterns are tried in the order returned by `patterns` (newest entry
  # first). The attributes of the first pattern accepted by `File.fnmatch`
  # are returned; attributes of several matching patterns are not merged.
  #
  # path - A path to a file for which to get the attributes.
  #
  # Returns a Hash, which is empty when no pattern matches.
  def attributes(path)
    _pattern, attrs = patterns.find { |pattern, _| File.fnmatch(pattern, path) }

    attrs || {}
  end

  # Returns a Hash containing the file patterns and their attributes.
  #
  # The attributes file is parsed on the first call only; the result is
  # memoized for the lifetime of this object.
  def patterns
    @patterns ||= parse_file
  end

  # Parses an attribute string.
  #
  # These strings can be in the following formats:
  #
  #     text      # => { "text" => true }
  #     -text     # => { "text" => false }
  #     key=value # => { "key" => "value" }
  #
  # string - The string to parse.
  #
  # Returns a Hash containing the attributes and their values.
  def parse_attributes(string)
    values = {}
    dash = '-'
    equal = '='

    string.split(/\s+/).each do |chunk|
      # Data such as "foo = bar" should be treated as "foo" and "bar" being
      # separate boolean attributes.
      next if chunk == equal

      # Input: "-foo"
      if chunk.start_with?(dash)
        # Slice by characters rather than bytes so attribute names that
        # contain multibyte characters are not truncated.
        values[chunk[1..-1]] = false

      # Input: "foo=bar"
      elsif chunk.include?(equal)
        # Only split on the first "=" so values such as "a?b=c" stay intact.
        key, value = chunk.split(equal, 2)
        values[key] = value

      # Input: "foo"
      else
        values[chunk] = true
      end
    end

    values
  end

  # Iterates over every line in the attributes file, yielding each line with
  # surrounding whitespace stripped.
  #
  # Nothing is yielded when the repository has no `info/attributes` file.
  def each_line
    full_path = File.join(@path, 'info/attributes')

    # Not every repository has an attributes file; treat that as "no
    # attributes" instead of raising Errno::ENOENT.
    return unless File.exist?(full_path)

    File.open(full_path, 'r') do |handle|
      handle.each_line do |line|
        yield line.strip
      end
    end
  end

  private

  # Parses the Git attributes file.
  #
  # Returns a Hash mapping every pattern to its attributes Hash, ordered from
  # the last entry in the file to the first.
  def parse_file
    pairs = []
    comment = '#'

    each_line do |line|
      next if line.start_with?(comment) || line.empty?

      pattern, attrs = line.split(/\s+/, 2)

      # A line may consist of just a pattern, in which case there is nothing
      # to parse.
      pairs << [pattern, attrs ? parse_attributes(attrs) : {}]
    end

    # Newer entries take precedence over older entries. Walking the pairs
    # backwards and keeping only the first occurrence of a pattern ensures a
    # repeated pattern keeps the attributes (and position) of its newest
    # entry.
    pairs.reverse_each.with_object({}) do |(pattern, attrs), result|
      result[pattern] = attrs unless result.key?(pattern)
    end
  end
end
end
end
Loading
Loading
@@ -27,14 +27,12 @@ module Gitlab
# Rugged repo object
attr_reader :rugged
 
# Define a delegator for the rugged attributes
def_delegator :rugged, :attributes
# 'path' must be the path to a _bare_ git repository, e.g.
# /path/to/my-repo.git
def initialize(path)
@path = path
# The repository name is the last "/"-separated segment of the path.
@name = path.split("/").last
# Attributes parser for this repository path; #attributes delegates to it.
@attributes = Attributes.new(path)
end
 
# Default branch in the repository
Loading
Loading
@@ -978,8 +976,14 @@ module Gitlab
 
# Checks if the blob should be diffable according to its attributes
def diffable?(blob)
# NOTE(review): this excerpt appears to come from a rendered diff, so the
# next two lines may be the removed implementation shown next to its
# replacement — confirm against the real file.
blob_attributes = attributes(blob.path).to_h
blob_attributes.fetch('diff', blob.text?)
# Uses the 'diff' attribute when it is set for the blob's path; the block
# form means blob.text? is only evaluated when the attribute is absent.
attributes(blob.path).fetch('diff') { blob.text? }
end
# Returns the Git attributes for the given file path.
#
# See `Gitlab::Git::Attributes` for more information.
def attributes(path)
# Delegates to the object stored in @attributes.
@attributes.attributes(path)
end
 
private
Loading
Loading
require 'spec_helper'
# Specs for Gitlab::Git::Attributes, run against the repository directory
# SUPPORT_PATH/with-git-attributes.git.
# NOTE(review): presumably the fixture written by
# SeedHelper#create_git_attributes — confirm.
describe Gitlab::Git::Attributes do
let(:path) { File.join(SUPPORT_PATH, 'with-git-attributes.git') }
subject { described_class.new(path) }
# Looking up the attributes of individual file paths.
describe '#attributes' do
context 'using a path with attributes' do
it 'returns the attributes as a Hash' do
expect(subject.attributes('test.txt')).to eq({ 'text' => true })
end
it 'returns a Hash containing multiple attributes' do
expect(subject.attributes('test.sh')).
to eq({ 'eol' => 'lf', 'gitlab-language' => 'shell' })
end
it 'returns a Hash containing attributes for a file with multiple extensions' do
expect(subject.attributes('test.haml.html')).
to eq({ 'gitlab-language' => 'haml' })
end
it 'returns a Hash containing attributes for a file in a directory' do
expect(subject.attributes('foo/bar.txt')).to eq({ 'foo' => true })
end
# Only the first "=" separates key and value; the rest stays in the value.
it 'returns a Hash containing attributes with query string parameters' do
expect(subject.attributes('foo.cgi')).
to eq({ 'key' => 'value?p1=v1&p2=v2' })
end
end
context 'using a path without any attributes' do
it 'returns an empty Hash' do
expect(subject.attributes('test.foo')).to eq({})
end
end
end
# The parsed pattern => attributes mapping as a whole.
describe '#patterns' do
it 'parses a file with entries' do
expect(subject.patterns).to be_an_instance_of(Hash)
end
it 'parses an entry that uses a tab to separate the pattern and attributes' do
expect(subject.patterns['*.md']).
to eq({ 'gitlab-language' => 'markdown' })
end
# The first key of the Hash is expected to be '*.md'.
# NOTE(review): presumably the last entry of the fixture file — confirm.
it 'stores patterns in reverse order' do
first = subject.patterns.to_a[0]
expect(first[0]).to eq('*.md')
end
# It's a bit hard to test for something _not_ being processed. As such we'll
# just test the number of entries.
it 'ignores any comments and empty lines' do
expect(subject.patterns.length).to eq(7)
end
end
# Parsing of a single attribute string, independent of the fixture file.
describe '#parse_attributes' do
it 'parses a boolean attribute' do
expect(subject.parse_attributes('text')).to eq({ 'text' => true })
end
it 'parses a negated boolean attribute' do
expect(subject.parse_attributes('-text')).to eq({ 'text' => false })
end
it 'parses a key-value pair' do
expect(subject.parse_attributes('foo=bar')).to eq({ 'foo' => 'bar' })
end
it 'parses multiple attributes' do
input = 'boolean key=value -negated'
expect(subject.parse_attributes(input)).
to eq({ 'boolean' => true, 'key' => 'value', 'negated' => false })
end
it 'parses attributes with query string parameters' do
expect(subject.parse_attributes('foo=bar?baz=1')).
to eq({ 'foo' => 'bar?baz=1' })
end
end
# Raw line iteration: every yielded value must be a String.
describe '#each_line' do
it 'iterates over every line in the attributes file' do
args = [String] * 11 # the number of lines in the file
expect { |b| subject.each_line(&b) }.to yield_successive_args(*args)
end
end
end
Loading
Loading
@@ -11,6 +11,7 @@ module SeedHelper
create_bare_seeds
create_normal_seeds
create_mutable_seeds
create_git_attributes
end
 
def create_bare_seeds
Loading
Loading
@@ -38,6 +39,28 @@ module SeedHelper
chdir: TEST_MUTABLE_REPO_PATH, out: '/dev/null', err: '/dev/null')
end
 
# Writes an `info/attributes` file into the `with-git-attributes.git`
# directory under SUPPORT_PATH, creating the `info` directory when needed.
def create_git_attributes
dir = File.join(SUPPORT_PATH, 'with-git-attributes.git', 'info')
FileUtils.mkdir_p(dir)
File.open(File.join(dir, 'attributes'), 'w') do |handle|
# `.strip` removes the leading and trailing whitespace of the heredoc text
# before it is written.
handle.write <<-EOF.strip
# This is a comment, it should be ignored.
*.txt text
*.jpg -text
*.sh eol=lf gitlab-language=shell
*.haml.* gitlab-language=haml
foo/bar.* foo
*.cgi key=value?p1=v1&p2=v2
# This uses a tab instead of spaces to ensure the parser also supports this.
*.md\tgitlab-language=markdown
EOF
end
end
# Prevent developer git configurations from being persisted to test
# repositories
def git_env
Loading
Loading
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment