Skip to content
Snippets Groups Projects
Commit f2d50af9 authored by Sean McGivern's avatar Sean McGivern
Browse files

Migrate MR commits and diffs to new tables

Previously, we stored these as serialised fields - `st_{commits,diffs}` - on the
`merge_request_diffs` table. These now have their own tables -
`merge_request_diff_{commits,diffs}` - with a column for each attribute of the
serialised data.

Add a background migration to go through the existing MR diffs and migrate them
to the new format. Ignore any contents that cannot be displayed. Assuming that
we have 5 million rows to migrate, and each batch of 2,500 rows can be
completed in 5 minutes, this will take about 7 days to migrate everything.
parent 1018ab05
No related branches found
No related tags found
No related merge requests found
Loading
Loading
@@ -85,11 +85,7 @@ class MergeRequestDiff < ActiveRecord::Base
 
def raw_diffs(options = {})
if options[:ignore_whitespace_change]
@diffs_no_whitespace ||=
Gitlab::Git::Compare.new(
repository.raw_repository,
safe_start_commit_sha,
head_commit_sha).diffs(options)
@diffs_no_whitespace ||= compare.diffs(options)
else
@raw_diffs ||= {}
@raw_diffs[options] ||= load_diffs(options)
Loading
Loading
class ScheduleMergeRequestDiffMigrations < ActiveRecord::Migration
include Gitlab::Database::MigrationHelpers
DOWNTIME = false
BATCH_SIZE = 2500
MIGRATION = 'DeserializeMergeRequestDiffsAndCommits'
disable_ddl_transaction!
class MergeRequestDiff < ActiveRecord::Base
self.table_name = 'merge_request_diffs'
include ::EachBatch
end
# Assuming that there are 5 million rows affected (which is more than on
# GitLab.com), and that each batch of 2,500 rows takes up to 5 minutes, then
# we can migrate all the rows in 7 days.
#
# On staging, plucking the IDs themselves takes 5 seconds.
def up
non_empty = 'st_commits IS NOT NULL OR st_diffs IS NOT NULL'
MergeRequestDiff.where(non_empty).each_batch(of: BATCH_SIZE) do |relation, index|
range = relation.pluck('MIN(id)', 'MAX(id)').first
BackgroundMigrationWorker.perform_in(index * 5.minutes, MIGRATION, range)
end
end
def down
end
end
module Gitlab
module BackgroundMigration
class DeserializeMergeRequestDiffsAndCommits
attr_reader :diff_ids, :commit_rows, :file_rows
class MergeRequestDiff < ActiveRecord::Base
self.table_name = 'merge_request_diffs'
end
BUFFER_ROWS = 1000
def perform(start_id, stop_id)
merge_request_diffs = MergeRequestDiff
.select(:id, :st_commits, :st_diffs)
.where('st_commits IS NOT NULL OR st_diffs IS NOT NULL')
.where(id: start_id..stop_id)
reset_buffers!
merge_request_diffs.each do |merge_request_diff|
commits, files = single_diff_rows(merge_request_diff)
diff_ids << merge_request_diff.id
commit_rows.concat(commits)
file_rows.concat(files)
if diff_ids.length > BUFFER_ROWS ||
commit_rows.length > BUFFER_ROWS ||
file_rows.length > BUFFER_ROWS
flush_buffers!
end
end
flush_buffers!
end
private
def reset_buffers!
@diff_ids = []
@commit_rows = []
@file_rows = []
end
def flush_buffers!
if diff_ids.any?
MergeRequestDiff.transaction do
Gitlab::Database.bulk_insert('merge_request_diff_commits', commit_rows)
Gitlab::Database.bulk_insert('merge_request_diff_files', file_rows)
MergeRequestDiff.where(id: diff_ids).update_all(st_commits: nil, st_diffs: nil)
end
end
reset_buffers!
end
def single_diff_rows(merge_request_diff)
sha_attribute = Gitlab::Database::ShaAttribute.new
commits = YAML.load(merge_request_diff.st_commits) rescue []
commit_rows = commits.map.with_index do |commit, index|
commit_hash = commit.to_hash.with_indifferent_access.except(:parent_ids)
sha = commit_hash.delete(:id)
commit_hash.merge(
merge_request_diff_id: merge_request_diff.id,
relative_order: index,
sha: sha_attribute.type_cast_for_database(sha)
)
end
diffs = YAML.load(merge_request_diff.st_diffs) rescue []
diffs = [] unless valid_raw_diffs?(diffs)
file_rows = diffs.map.with_index do |diff, index|
diff_hash = diff.to_hash.with_indifferent_access.merge(
binary: false,
merge_request_diff_id: merge_request_diff.id,
relative_order: index
)
# Compatibility with old diffs created with Psych.
diff_hash.tap do |hash|
diff_text = hash[:diff]
if diff_text.encoding == Encoding::BINARY && !diff_text.ascii_only?
hash[:binary] = true
hash[:diff] = [diff_text].pack('m0')
end
end
end
[commit_rows, file_rows]
end
# Unlike MergeRequestDiff#valid_raw_diff?, don't count Rugged objects as
# valid, because we don't render them usefully anyway.
def valid_raw_diffs?(diffs)
return false unless diffs.respond_to?(:each)
diffs.all? { |diff| diff.is_a?(Hash) }
end
end
end
end
require 'spec_helper'
describe Gitlab::BackgroundMigration::DeserializeMergeRequestDiffsAndCommits do
describe '#perform' do
set(:merge_request) { create(:merge_request) }
set(:merge_request_diff) { merge_request.merge_request_diff }
let(:updated_merge_request_diff) { MergeRequestDiff.find(merge_request_diff.id) }
def diffs_to_hashes(diffs)
diffs.as_json(only: Gitlab::Git::Diff::SERIALIZE_KEYS).map(&:with_indifferent_access)
end
def quote_yaml(value)
MergeRequestDiff.connection.quote(YAML.dump(value))
end
def convert_to_yaml(merge_request_diff_id, commits, diffs)
MergeRequestDiff.where(id: merge_request_diff_id).update_all(
"st_commits = #{quote_yaml(commits)}, st_diffs = #{quote_yaml(diffs)}"
)
end
shared_examples 'updated MR diff' do
before do
convert_to_yaml(merge_request_diff.id, commits, diffs)
MergeRequestDiffCommit.delete_all
MergeRequestDiffFile.delete_all
subject.perform(merge_request_diff.id, merge_request_diff.id)
end
it 'creates correct entries in the merge_request_diff_commits table' do
expect(updated_merge_request_diff.merge_request_diff_commits.count).to eq(commits.count)
expect(updated_merge_request_diff.commits.map(&:to_hash)).to eq(commits)
end
it 'creates correct entries in the merge_request_diff_files table' do
expect(updated_merge_request_diff.merge_request_diff_files.count).to eq(expected_diffs.count)
expect(diffs_to_hashes(updated_merge_request_diff.raw_diffs)).to eq(expected_diffs)
end
it 'sets the st_commits and st_diffs columns to nil' do
expect(updated_merge_request_diff.st_commits_before_type_cast).to be_nil
expect(updated_merge_request_diff.st_diffs_before_type_cast).to be_nil
end
end
context 'when the diff IDs passed do not exist' do
it 'does not raise' do
expect { subject.perform(0, 0) }.not_to raise_exception
end
end
context 'when the merge request diff has no serialised commits or diffs' do
before do
merge_request_diff.update(st_commits: nil, st_diffs: nil)
end
it 'does not raise' do
expect { subject.perform(merge_request_diff.id, merge_request_diff.id) }
.not_to raise_exception
end
end
context 'processing multiple merge request diffs' do
let(:start_id) { described_class::MergeRequestDiff.minimum(:id) }
let(:stop_id) { described_class::MergeRequestDiff.maximum(:id) }
before do
merge_request.reload_diff(true)
convert_to_yaml(start_id, merge_request_diff.commits, merge_request_diff.diffs)
convert_to_yaml(stop_id, updated_merge_request_diff.commits, updated_merge_request_diff.diffs)
MergeRequestDiffCommit.delete_all
MergeRequestDiffFile.delete_all
end
context 'when BUFFER_ROWS is exceeded' do
before do
stub_const("#{described_class}::BUFFER_ROWS", 1)
end
it 'updates and continues' do
expect(described_class::MergeRequestDiff).to receive(:transaction).twice
subject.perform(start_id, stop_id)
end
end
context 'when BUFFER_ROWS is not exceeded' do
it 'only updates once' do
expect(described_class::MergeRequestDiff).to receive(:transaction).once
subject.perform(start_id, stop_id)
end
end
end
context 'when the merge request diff update fails' do
before do
allow(described_class::MergeRequestDiff)
.to receive(:update_all).and_raise(ActiveRecord::Rollback)
end
it 'does not add any diff commits' do
expect { subject.perform(merge_request_diff.id, merge_request_diff.id) }
.not_to change { MergeRequestDiffCommit.count }
end
it 'does not add any diff files' do
expect { subject.perform(merge_request_diff.id, merge_request_diff.id) }
.not_to change { MergeRequestDiffFile.count }
end
end
context 'when the merge request diff has valid commits and diffs' do
let(:commits) { merge_request_diff.commits.map(&:to_hash) }
let(:diffs) { diffs_to_hashes(merge_request_diff.merge_request_diff_files) }
let(:expected_diffs) { diffs }
include_examples 'updated MR diff'
end
context 'when the merge request diffs have binary content' do
let(:commits) { merge_request_diff.commits.map(&:to_hash) }
let(:expected_diffs) { diffs }
# The start of a PDF created by Illustrator
let(:binary_string) do
"\x25\x50\x44\x46\x2d\x31\x2e\x35\x0d\x25\xe2\xe3\xcf\xd3\x0d\x0a".force_encoding(Encoding::BINARY)
end
let(:diffs) do
[
{
'diff' => binary_string,
'new_path' => 'path',
'old_path' => 'path',
'a_mode' => '100644',
'b_mode' => '100644',
'new_file' => false,
'renamed_file' => false,
'deleted_file' => false,
'too_large' => false
}
]
end
include_examples 'updated MR diff'
end
context 'when the merge request diff has commits, but no diffs' do
let(:commits) { merge_request_diff.commits.map(&:to_hash) }
let(:diffs) { [] }
let(:expected_diffs) { diffs }
include_examples 'updated MR diff'
end
context 'when the merge request diffs have invalid content' do
let(:commits) { merge_request_diff.commits.map(&:to_hash) }
let(:diffs) { ['--broken-diff'] }
let(:expected_diffs) { [] }
include_examples 'updated MR diff'
end
context 'when the merge request diffs are Rugged::Patch instances' do
let(:commits) { merge_request_diff.commits.map(&:to_hash) }
let(:first_commit) { merge_request.project.repository.commit(merge_request_diff.head_commit_sha) }
let(:diffs) { first_commit.diff_from_parent.patches }
let(:expected_diffs) { [] }
include_examples 'updated MR diff'
end
context 'when the merge request diffs are Rugged::Diff::Delta instances' do
let(:commits) { merge_request_diff.commits.map(&:to_hash) }
let(:first_commit) { merge_request.project.repository.commit(merge_request_diff.head_commit_sha) }
let(:diffs) { first_commit.diff_from_parent.deltas }
let(:expected_diffs) { [] }
include_examples 'updated MR diff'
end
end
end
require 'spec_helper'
require Rails.root.join('db', 'post_migrate', '20170703130158_schedule_merge_request_diff_migrations')
describe ScheduleMergeRequestDiffMigrations, :migration, :sidekiq do
matcher :be_scheduled_migration do |time, *expected|
match do |migration|
BackgroundMigrationWorker.jobs.any? do |job|
job['args'] == [migration, expected] &&
job['at'].to_i == time.to_i
end
end
failure_message do |migration|
"Migration `#{migration}` with args `#{expected.inspect}` not scheduled!"
end
end
let(:merge_request_diffs) { table(:merge_request_diffs) }
let(:merge_requests) { table(:merge_requests) }
let(:projects) { table(:projects) }
before do
stub_const("#{described_class.name}::BATCH_SIZE", 1)
projects.create!(id: 1, name: 'gitlab', path: 'gitlab')
merge_requests.create!(id: 1, target_project_id: 1, source_project_id: 1, target_branch: 'feature', source_branch: 'master')
merge_request_diffs.create!(id: 1, merge_request_id: 1, st_commits: YAML.dump([]), st_diffs: nil)
merge_request_diffs.create!(id: 2, merge_request_id: 1, st_commits: nil, st_diffs: YAML.dump([]))
merge_request_diffs.create!(id: 3, merge_request_id: 1, st_commits: nil, st_diffs: nil)
merge_request_diffs.create!(id: 4, merge_request_id: 1, st_commits: YAML.dump([]), st_diffs: YAML.dump([]))
end
it 'correctly schedules background migrations' do
Sidekiq::Testing.fake! do
Timecop.freeze do
migrate!
expect(described_class::MIGRATION).to be_scheduled_migration(5.minutes.from_now, 1, 1)
expect(described_class::MIGRATION).to be_scheduled_migration(10.minutes.from_now, 2, 2)
expect(described_class::MIGRATION).to be_scheduled_migration(15.minutes.from_now, 4, 4)
expect(BackgroundMigrationWorker.jobs.size).to eq 3
end
end
end
it 'schedules background migrations' do
Sidekiq::Testing.inline! do
non_empty = 'st_commits IS NOT NULL OR st_diffs IS NOT NULL'
expect(merge_request_diffs.where(non_empty).count).to eq 3
migrate!
expect(merge_request_diffs.where(non_empty).count).to eq 0
end
end
end
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment