Skip to content
Snippets Groups Projects
Commit 90886008 authored by Douwe Maan's avatar Douwe Maan
Browse files

Merge branch '30093-apply-bfg-object-map-to-database' into 'master'

Remove cleaned up OIDs from database and cache

Closes #30093

See merge request gitlab-org/gitlab-ce!26555
parents 0cc30909 8973f32d
No related branches found
No related tags found
No related merge requests found
Showing
with 302 additions and 123 deletions
1.40.0
1.41.0
Loading
Loading
@@ -51,6 +51,10 @@ class MergeRequestDiff < ApplicationRecord
joins(:merge_request_diff_commits).where(merge_request_diff_commits: { sha: sha }).reorder(nil)
end
 
scope :by_project_id, -> (project_id) do
joins(:merge_request).where(merge_requests: { target_project_id: project_id })
end
scope :recent, -> { order(id: :desc).limit(100) }
scope :files_in_database, -> { where(stored_externally: [false, nil]) }
 
Loading
Loading
Loading
Loading
@@ -7,6 +7,10 @@ class NoteDiffFile < ApplicationRecord
joins(:diff_note).where("resolved_at IS NULL OR noteable_type = 'Commit'")
end
 
scope :referencing_sha, -> (oids, project_id:) do
joins(:diff_note).where(notes: { project_id: project_id, commit_id: oids })
end
delegate :original_position, :project, to: :diff_note
 
belongs_to :diff_note, inverse_of: :note_diff_file
Loading
Loading
Loading
Loading
@@ -18,9 +18,6 @@ module Projects
# per rewritten object, with the old and new SHAs space-separated. It can be
# used to update or remove content that references the objects that BFG has
# altered
#
# Currently, only the project repository is modified by this service, but we
# may wish to modify other data sources in the future.
def execute
apply_bfg_object_map!
 
Loading
Loading
@@ -41,10 +38,52 @@ module Projects
raise NoUploadError unless project.bfg_object_map.exists?
 
project.bfg_object_map.open do |io|
repository_cleaner.apply_bfg_object_map(io)
repository_cleaner.apply_bfg_object_map_stream(io) do |response|
cleanup_diffs(response)
end
end
end
def cleanup_diffs(response)
old_commit_shas = extract_old_commit_shas(response.entries)
ActiveRecord::Base.transaction do
cleanup_merge_request_diffs(old_commit_shas)
cleanup_note_diff_files(old_commit_shas)
end
end
 
def extract_old_commit_shas(batch)
batch.lazy.select { |entry| entry.type == :COMMIT }.map(&:old_oid).force
end
def cleanup_merge_request_diffs(old_commit_shas)
merge_request_diffs = MergeRequestDiff
.by_project_id(project.id)
.by_commit_sha(old_commit_shas)
# It's important to run the ActiveRecord callbacks here
merge_request_diffs.destroy_all # rubocop:disable Cop/DestroyAll
# TODO: ensure the highlight cache is removed immediately. It's too hard
# to calculate the Redis keys at present.
#
# https://gitlab.com/gitlab-org/gitlab-ce/issues/61115
end
def cleanup_note_diff_files(old_commit_shas)
# Pluck the IDs instead of running the query twice to ensure we clear the
# cache for exactly the note diffs we remove
ids = NoteDiffFile
.referencing_sha(old_commit_shas, project_id: project.id)
.pluck_primary_key
NoteDiffFile.id_in(ids).delete_all
# A highlighted version of the diff is stored in redis. Remove it now.
Gitlab::DiscussionsDiff::HighlightCache.clear_multiple(ids)
end
def repository_cleaner
@repository_cleaner ||= Gitlab::Git::RepositoryCleaner.new(repository.raw)
end
Loading
Loading
---
title: Remove cleaned up OIDs from database and cache
merge_request: 26555
author:
type: added
Loading
Loading
@@ -98,6 +98,12 @@ up its own internal state, maximizing the space saved.
`git gc` against the repository. You will receive an email once it has
completed.
 
This process will remove some copies of the rewritten commits from GitLab's
cache and database, but there are still numerous gaps in coverage - at present,
some of the copies may persist indefinitely. [Clearing the instance cache]
(../../../administration/raketasks/maintenance.md#clear-redis-cache) may help to
remove some of them, but it should not be depended on for security purposes!
## Using `git filter-branch`
 
1. Navigate to your repository:
Loading
Loading
Loading
Loading
@@ -52,6 +52,19 @@ module Gitlab
end
end
 
# Clears multiple cache keys at once.
#
# raw_keys - An Array of unique cache keys, without namespaces.
#
# It returns the number of cache keys cleared. Ex.: 42
def clear_multiple(raw_keys)
return [] if raw_keys.empty?
keys = raw_keys.map { |id| cache_key_for(id) }
Redis::Cache.with { |redis| redis.del(keys) }
end
def cache_key_for(raw_key)
"#{cache_key_prefix}:#{raw_key}"
end
Loading
Loading
Loading
Loading
@@ -12,9 +12,9 @@ module Gitlab
@repository = repository
end
 
def apply_bfg_object_map(io)
def apply_bfg_object_map_stream(io, &blk)
wrapped_gitaly_errors do
gitaly_cleanup_client.apply_bfg_object_map(io)
gitaly_cleanup_client.apply_bfg_object_map_stream(io, &blk)
end
end
 
Loading
Loading
Loading
Loading
@@ -12,25 +12,32 @@ module Gitlab
@storage = repository.storage
end
 
def apply_bfg_object_map(io)
first_request = Gitaly::ApplyBfgObjectMapRequest.new(repository: gitaly_repo)
def apply_bfg_object_map_stream(io, &blk)
responses = GitalyClient.call(
storage,
:cleanup_service,
:apply_bfg_object_map_stream,
build_object_map_enum(io),
timeout: GitalyClient.no_timeout
)
responses.each(&blk)
end
private
 
enum = Enumerator.new do |y|
y.yield first_request
def build_object_map_enum(io)
Enumerator.new do |y|
# First request. For simplicity, doesn't include any object map data
y << Gitaly::ApplyBfgObjectMapStreamRequest.new(repository: gitaly_repo)
 
# Now stream the BFG object map file to gitaly in chunks
while data = io.read(RepositoryService::MAX_MSG_SIZE)
y.yield Gitaly::ApplyBfgObjectMapRequest.new(object_map: data)
y << Gitaly::ApplyBfgObjectMapStreamRequest.new(object_map: data)
break if io&.eof?
end
end
GitalyClient.call(
storage,
:cleanup_service,
:apply_bfg_object_map,
enum,
timeout: GitalyClient.no_timeout
)
end
end
end
Loading
Loading
Loading
Loading
@@ -3,31 +3,32 @@
require 'spec_helper'
 
describe Gitlab::DiscussionsDiff::HighlightCache, :clean_gitlab_redis_cache do
def fake_file(offset)
{
text: 'foo',
type: 'new',
index: 2 + offset,
old_pos: 10 + offset,
new_pos: 11 + offset,
line_code: 'xpto',
rich_text: '<blips>blops</blips>'
}
end
let(:mapping) do
{
3 => [
fake_file(0),
fake_file(1)
],
4 => [
fake_file(2)
]
}
end
describe '#write_multiple' do
it 'sets multiple keys serializing content as JSON' do
mapping = {
3 => [
{
text: 'foo',
type: 'new',
index: 2,
old_pos: 10,
new_pos: 11,
line_code: 'xpto',
rich_text: '<blips>blops</blips>'
},
{
text: 'foo',
type: 'new',
index: 3,
old_pos: 11,
new_pos: 12,
line_code: 'xpto',
rich_text: '<blops>blips</blops>'
}
]
}
described_class.write_multiple(mapping)
 
mapping.each do |key, value|
Loading
Loading
@@ -41,53 +42,16 @@ describe Gitlab::DiscussionsDiff::HighlightCache, :clean_gitlab_redis_cache do
 
describe '#read_multiple' do
it 'reads multiple keys and serializes content into Gitlab::Diff::Line objects' do
mapping = {
3 => [
{
text: 'foo',
type: 'new',
index: 2,
old_pos: 11,
new_pos: 12,
line_code: 'xpto',
rich_text: '<blips>blops</blips>'
},
{
text: 'foo',
type: 'new',
index: 3,
old_pos: 10,
new_pos: 11,
line_code: 'xpto',
rich_text: '<blips>blops</blips>'
}
]
}
described_class.write_multiple(mapping)
 
found = described_class.read_multiple(mapping.keys)
 
expect(found.size).to eq(1)
expect(found.size).to eq(2)
expect(found.first.size).to eq(2)
expect(found.first).to all(be_a(Gitlab::Diff::Line))
end
 
it 'returns nil when cached key is not found' do
mapping = {
3 => [
{
text: 'foo',
type: 'new',
index: 2,
old_pos: 11,
new_pos: 12,
line_code: 'xpto',
rich_text: '<blips>blops</blips>'
}
]
}
described_class.write_multiple(mapping)
 
found = described_class.read_multiple([2, 3])
Loading
Loading
@@ -95,8 +59,30 @@ describe Gitlab::DiscussionsDiff::HighlightCache, :clean_gitlab_redis_cache do
expect(found.size).to eq(2)
 
expect(found.first).to eq(nil)
expect(found.second.size).to eq(1)
expect(found.second.size).to eq(2)
expect(found.second).to all(be_a(Gitlab::Diff::Line))
end
end
describe '#clear_multiple' do
it 'removes all named keys' do
described_class.write_multiple(mapping)
described_class.clear_multiple(mapping.keys)
expect(described_class.read_multiple(mapping.keys)).to all(be_nil)
end
it 'only removed named keys' do
to_clear, to_leave = mapping.keys
described_class.write_multiple(mapping)
described_class.clear_multiple([to_clear])
cleared, left = described_class.read_multiple([to_clear, to_leave])
expect(cleared).to be_nil
expect(left).to all(be_a(Gitlab::Diff::Line))
end
end
end
Loading
Loading
@@ -6,55 +6,62 @@ describe Gitlab::Git::RepositoryCleaner do
let(:project) { create(:project, :repository) }
let(:repository) { project.repository }
let(:head_sha) { repository.head_commit.id }
let(:object_map_data) { "#{head_sha} #{'0' * 40}" }
let(:object_map_data) { "#{head_sha} #{Gitlab::Git::BLANK_SHA}" }
 
subject(:cleaner) { described_class.new(repository.raw) }
let(:clean_refs) { %W[refs/environments/1 refs/merge-requests/1 refs/keep-around/#{head_sha}] }
let(:keep_refs) { %w[refs/heads/_keep refs/tags/_keep] }
 
describe '#apply_bfg_object_map' do
let(:clean_refs) { %W[refs/environments/1 refs/merge-requests/1 refs/keep-around/#{head_sha}] }
let(:keep_refs) { %w[refs/heads/_keep refs/tags/_keep] }
subject(:cleaner) { described_class.new(repository.raw) }
 
shared_examples_for '#apply_bfg_object_map_stream' do
before do
(clean_refs + keep_refs).each { |ref| repository.create_ref(head_sha, ref) }
end
 
context 'from StringIO' do
let(:object_map) { StringIO.new(object_map_data) }
it 'removes internal references' do
entries = []
 
it 'removes internal references' do
cleaner.apply_bfg_object_map(object_map)
cleaner.apply_bfg_object_map_stream(object_map) do |rsp|
entries.concat(rsp.entries)
end
 
aggregate_failures do
clean_refs.each { |ref| expect(repository.ref_exists?(ref)).to be_falsy }
keep_refs.each { |ref| expect(repository.ref_exists?(ref)).to be_truthy }
end
aggregate_failures do
clean_refs.each { |ref| expect(repository.ref_exists?(ref)).to be(false) }
keep_refs.each { |ref| expect(repository.ref_exists?(ref)).to be(true) }
expect(entries).to contain_exactly(
Gitaly::ApplyBfgObjectMapStreamResponse::Entry.new(
type: :COMMIT,
old_oid: head_sha,
new_oid: Gitlab::Git::BLANK_SHA
)
)
end
end
end
 
context 'from Gitlab::HttpIO' do
let(:url) { 'http://example.com/bfg_object_map.txt' }
let(:tempfile) { Tempfile.new }
let(:object_map) { Gitlab::HttpIO.new(url, object_map_data.size) }
describe '#apply_bfg_object_map_stream (from StringIO)' do
let(:object_map) { StringIO.new(object_map_data) }
 
around do |example|
tempfile.write(object_map_data)
tempfile.close
include_examples '#apply_bfg_object_map_stream'
end
 
example.run
ensure
tempfile.unlink
end
describe '#apply_bfg_object_map_stream (from Gitlab::HttpIO)' do
let(:url) { 'http://example.com/bfg_object_map.txt' }
let(:tempfile) { Tempfile.new }
let(:object_map) { Gitlab::HttpIO.new(url, object_map_data.size) }
 
it 'removes internal references' do
stub_remote_url_200(url, tempfile.path)
around do |example|
tempfile.write(object_map_data)
tempfile.close
 
cleaner.apply_bfg_object_map(object_map)
stub_remote_url_200(url, tempfile.path)
 
aggregate_failures do
clean_refs.each { |ref| expect(repository.ref_exists?(ref)).to be_falsy }
keep_refs.each { |ref| expect(repository.ref_exists?(ref)).to be_truthy }
end
end
example.run
ensure
tempfile.unlink
end
include_examples '#apply_bfg_object_map_stream'
end
end
Loading
Loading
@@ -6,14 +6,14 @@ describe Gitlab::GitalyClient::CleanupService do
let(:relative_path) { project.disk_path + '.git' }
let(:client) { described_class.new(project.repository) }
 
describe '#apply_bfg_object_map' do
it 'sends an apply_bfg_object_map message' do
describe '#apply_bfg_object_map_stream' do
it 'sends an apply_bfg_object_map_stream message' do
expect_any_instance_of(Gitaly::CleanupService::Stub)
.to receive(:apply_bfg_object_map)
.to receive(:apply_bfg_object_map_stream)
.with(kind_of(Enumerator), kind_of(Hash))
.and_return(double)
.and_return([])
 
client.apply_bfg_object_map(StringIO.new)
client.apply_bfg_object_map_stream(StringIO.new)
end
end
end
Loading
Loading
@@ -10,4 +10,31 @@ describe NoteDiffFile do
describe 'validations' do
it { is_expected.to validate_presence_of(:diff_note) }
end
describe '.referencing_sha' do
let!(:diff_note) { create(:diff_note_on_commit) }
let(:note_diff_file) { diff_note.note_diff_file }
let(:project) { diff_note.project }
it 'finds note diff files by project and sha' do
found = described_class.referencing_sha(diff_note.commit_id, project_id: project.id)
expect(found).to contain_exactly(note_diff_file)
end
it 'excludes note diff files with the wrong project' do
other_project = create(:project)
found = described_class.referencing_sha(diff_note.commit_id, project_id: other_project.id)
expect(found).to be_empty
end
it 'excludes note diff files with the wrong sha' do
found = described_class.referencing_sha(Gitlab::Git::BLANK_SHA, project_id: project.id)
expect(found).to be_empty
end
end
end
Loading
Loading
@@ -6,13 +6,13 @@ describe Projects::CleanupService do
let(:project) { create(:project, :repository, bfg_object_map: fixture_file_upload('spec/fixtures/bfg_object_map.txt')) }
let(:object_map) { project.bfg_object_map }
 
let(:cleaner) { service.__send__(:repository_cleaner) }
subject(:service) { described_class.new(project) }
 
describe '#execute' do
it 'runs the apply_bfg_object_map gitaly RPC' do
expect_next_instance_of(Gitlab::Git::RepositoryCleaner) do |cleaner|
expect(cleaner).to receive(:apply_bfg_object_map).with(kind_of(IO))
end
it 'runs the apply_bfg_object_map_stream gitaly RPC' do
expect(cleaner).to receive(:apply_bfg_object_map_stream).with(kind_of(IO))
 
service.execute
end
Loading
Loading
@@ -37,10 +37,91 @@ describe Projects::CleanupService do
expect(object_map.exists?).to be_falsy
end
 
context 'with a tainted merge request diff' do
let(:merge_request) { create(:merge_request, source_project: project, target_project: project) }
let(:diff) { merge_request.merge_request_diff }
let(:entry) { build_entry(diff.commits.first.id) }
before do
allow(cleaner)
.to receive(:apply_bfg_object_map_stream)
.and_yield(Gitaly::ApplyBfgObjectMapStreamResponse.new(entries: [entry]))
end
it 'removes the tainted commit from the database' do
service.execute
expect(MergeRequestDiff.exists?(diff.id)).to be_falsy
end
it 'ignores non-commit responses from Gitaly' do
entry.type = :UNKNOWN
service.execute
expect(MergeRequestDiff.exists?(diff.id)).to be_truthy
end
end
context 'with a tainted diff note' do
let(:diff_note) { create(:diff_note_on_commit, project: project) }
let(:note_diff_file) { diff_note.note_diff_file }
let(:entry) { build_entry(diff_note.commit_id) }
let(:highlight_cache) { Gitlab::DiscussionsDiff::HighlightCache }
let(:cache_id) { note_diff_file.id }
before do
allow(cleaner)
.to receive(:apply_bfg_object_map_stream)
.and_yield(Gitaly::ApplyBfgObjectMapStreamResponse.new(entries: [entry]))
end
it 'removes the tainted commit from the database' do
service.execute
expect(NoteDiffFile.exists?(note_diff_file.id)).to be_falsy
end
it 'removes the highlight cache from redis' do
write_cache(highlight_cache, cache_id, [{}])
expect(read_cache(highlight_cache, cache_id)).not_to be_nil
service.execute
expect(read_cache(highlight_cache, cache_id)).to be_nil
end
it 'ignores non-commit responses from Gitaly' do
entry.type = :UNKNOWN
service.execute
expect(NoteDiffFile.exists?(note_diff_file.id)).to be_truthy
end
end
it 'raises an error if no object map can be found' do
object_map.remove!
 
expect { service.execute }.to raise_error(described_class::NoUploadError)
end
end
def build_entry(old_oid)
Gitaly::ApplyBfgObjectMapStreamResponse::Entry.new(
type: :COMMIT,
old_oid: old_oid,
new_oid: Gitlab::Git::BLANK_SHA
)
end
def read_cache(cache, key)
cache.read_multiple([key]).first
end
def write_cache(cache, key, value)
cache.write_multiple(key => value)
end
end
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment