Skip to content
Snippets Groups Projects
Commit edb5cac4 authored by Michael Kozono
Browse files

Use bulk inserts

parent 17ce21d7
No related branches found
No related tags found
No related merge requests found
Loading
Loading
@@ -20,7 +20,19 @@ module Gitlab
 
# Entry point of the job. Runs, in order:
#   1. ensure_temporary_tracking_table_exists
#   2. clear_untracked_file_paths (only when duplicates cannot be ignored)
#   3. store_untracked_file_paths
#   4. schedule_populate_untracked_uploads_jobs
# Returns whatever the final scheduling step returns.
def perform
  ensure_temporary_tracking_table_exists

  # Postgres < 9.5 has no ON CONFLICT DO NOTHING, and emulating
  # insert-if-not-exists without it would be slow. On those servers the
  # table is emptied first so plain bulk inserts can be used, which is
  # ~30x faster than individual inserts (~20 minutes worth of inserts at
  # GitLab.com scale instead of ~10 hours).
  # Every other installation gets bulk inserts as well, plus the ability
  # for these jobs to retry without having to clear and reinsert.
  duplicates_ignorable = can_bulk_insert_and_ignore_duplicates?
  unless duplicates_ignorable
    clear_untracked_file_paths
  end

  store_untracked_file_paths
  schedule_populate_untracked_uploads_jobs
end
 
Loading
Loading
@@ -44,6 +56,10 @@ module Gitlab
end
end
 
# Removes every UntrackedFile row via `delete_all`.
# `perform` calls this only when duplicates cannot be ignored on insert,
# so that the subsequent bulk insert starts from an empty table.
def clear_untracked_file_paths
UntrackedFile.delete_all
end
def store_untracked_file_paths
return unless Dir.exist?(ABSOLUTE_UPLOAD_DIR)
 
Loading
Loading
@@ -96,36 +112,35 @@ module Gitlab
end
 
def insert_file_paths(file_paths)
ActiveRecord::Base.transaction do
file_paths.each do |file_path|
insert_file_path(file_path)
end
end
end
sql = if postgresql_pre_9_5?
"INSERT INTO #{table_columns_and_values_for_insert(file_paths)};"
elsif postgresql?
"INSERT INTO #{table_columns_and_values_for_insert(file_paths)} ON CONFLICT DO NOTHING;"
else # MySQL
"INSERT IGNORE INTO #{table_columns_and_values_for_insert(file_paths)};"
end
 
def insert_file_path(file_path)
if postgresql_pre_9_5?
# No easy way to do ON CONFLICT DO NOTHING before Postgres 9.5 so just use Rails
return UntrackedFile.where(path: file_path).first_or_create
end
ActiveRecord::Base.connection.execute(sql)
end
 
table_columns_and_values = 'untracked_files_for_uploads (path, created_at, updated_at) VALUES (?, ?, ?)'
def table_columns_and_values_for_insert(file_paths)
timestamp = Time.now.utc.iso8601
 
sql = if postgresql?
"INSERT INTO #{table_columns_and_values} ON CONFLICT DO NOTHING;"
else
"INSERT IGNORE INTO #{table_columns_and_values};"
end
values = file_paths.map do |file_path|
ActiveRecord::Base.send(:sanitize_sql_array, ['(?, ?, ?)', file_path, timestamp, timestamp]) # rubocop:disable GitlabSecurity/PublicSend
end.join(', ')
 
timestamp = Time.now.utc.iso8601
sql = ActiveRecord::Base.send(:sanitize_sql_array, [sql, file_path, timestamp, timestamp]) # rubocop:disable GitlabSecurity/PublicSend
ActiveRecord::Base.connection.execute(sql)
"#{UntrackedFile.table_name} (path, created_at, updated_at) VALUES #{values}"
end
 
# Whether the current database is PostgreSQL, as reported by
# Gitlab::Database.postgresql?.
#
# The answer is memoized in @postgresql. A `defined?` guard is used rather
# than `||=` because `||=` re-evaluates the right-hand side whenever the
# cached value is false, so a negative answer would never be cached.
def postgresql?
  return @postgresql if defined?(@postgresql)

  @postgresql = Gitlab::Database.postgresql?
end
 
# True unless `postgresql_pre_9_5?` reports a PostgreSQL server older
# than 9.5. `perform` uses this to decide whether existing rows must be
# cleared before inserting, since older servers lack
# ON CONFLICT DO NOTHING.
def can_bulk_insert_and_ignore_duplicates?
!postgresql_pre_9_5?
end
def postgresql_pre_9_5?
@postgresql_pre_9_5 ||= postgresql? &&
ActiveRecord::Base.connection.select_value('SHOW server_version_num').to_i < 90500
Loading
Loading
Loading
Loading
@@ -53,80 +53,178 @@ describe Gitlab::BackgroundMigration::PrepareUntrackedUploads, :migration, :side
expect(record.reload.path.size).to eq(519)
end
 
context 'when files were uploaded before and after hashed storage was enabled' do
let!(:appearance) { create(:appearance, logo: uploaded_file, header_logo: uploaded_file) }
let!(:user) { create(:user, :with_avatar) }
let!(:project1) { create(:project, :with_avatar) }
let(:project2) { create(:project) } # instantiate after enabling hashed_storage
context "test bulk insert with ON CONFLICT DO NOTHING or IGNORE" do
around do |example|
# If this is CI, we use Postgres 9.2 so this whole context should be
# skipped since we're unable to use ON CONFLICT DO NOTHING or IGNORE.
if described_class.new.send(:can_bulk_insert_and_ignore_duplicates?)
example.run
end
end
 
before do
# Markdown upload before enabling hashed_storage
UploadService.new(project1, uploaded_file, FileUploader).execute
context 'when files were uploaded before and after hashed storage was enabled' do
let!(:appearance) { create(:appearance, logo: uploaded_file, header_logo: uploaded_file) }
let!(:user) { create(:user, :with_avatar) }
let!(:project1) { create(:project, :with_avatar) }
let(:project2) { create(:project) } # instantiate after enabling hashed_storage
 
stub_application_setting(hashed_storage_enabled: true)
before do
# Markdown upload before enabling hashed_storage
UploadService.new(project1, uploaded_file, FileUploader).execute
 
# Markdown upload after enabling hashed_storage
UploadService.new(project2, uploaded_file, FileUploader).execute
end
stub_application_setting(hashed_storage_enabled: true)
 
it 'adds unhashed files to the untracked_files_for_uploads table' do
described_class.new.perform
# Markdown upload after enabling hashed_storage
UploadService.new(project2, uploaded_file, FileUploader).execute
end
 
expect(untracked_files_for_uploads.count).to eq(5)
end
it 'adds unhashed files to the untracked_files_for_uploads table' do
described_class.new.perform
 
it 'adds files with paths relative to CarrierWave.root' do
described_class.new.perform
untracked_files_for_uploads.all.each do |file|
expect(file.path.start_with?('uploads/')).to be_truthy
expect(untracked_files_for_uploads.count).to eq(5)
end
end
 
it 'does not add hashed files to the untracked_files_for_uploads table' do
described_class.new.perform
hashed_file_path = project2.uploads.where(uploader: 'FileUploader').first.path
expect(untracked_files_for_uploads.where("path like '%#{hashed_file_path}%'").exists?).to be_falsey
end
it 'adds files with paths relative to CarrierWave.root' do
described_class.new.perform
untracked_files_for_uploads.all.each do |file|
expect(file.path.start_with?('uploads/')).to be_truthy
end
end
 
it 'correctly schedules the follow-up background migration jobs' do
described_class.new.perform
it 'does not add hashed files to the untracked_files_for_uploads table' do
described_class.new.perform
 
expect(described_class::FOLLOW_UP_MIGRATION).to be_scheduled_migration(1, 5)
expect(BackgroundMigrationWorker.jobs.size).to eq(1)
end
hashed_file_path = project2.uploads.where(uploader: 'FileUploader').first.path
expect(untracked_files_for_uploads.where("path like '%#{hashed_file_path}%'").exists?).to be_falsey
end
 
# E.g. from a previous failed run of this background migration
context 'when there is existing data in untracked_files_for_uploads' do
before do
it 'correctly schedules the follow-up background migration jobs' do
described_class.new.perform
expect(described_class::FOLLOW_UP_MIGRATION).to be_scheduled_migration(1, 5)
expect(BackgroundMigrationWorker.jobs.size).to eq(1)
end
 
it 'does not error or produce duplicates of existing data' do
expect do
# E.g. from a previous failed run of this background migration
context 'when there is existing data in untracked_files_for_uploads' do
before do
described_class.new.perform
end.not_to change { untracked_files_for_uploads.count }.from(5)
end
it 'does not error or produce duplicates of existing data' do
expect do
described_class.new.perform
end.not_to change { untracked_files_for_uploads.count }.from(5)
end
end
# E.g. The installation is in use at the time of migration, and someone has
# just uploaded a file
context 'when there are files in /uploads/tmp' do
let(:tmp_file) { Rails.root.join(described_class::ABSOLUTE_UPLOAD_DIR, 'tmp', 'some_file.jpg') }
before do
FileUtils.touch(tmp_file)
end
after do
FileUtils.rm(tmp_file)
end
it 'does not add files from /uploads/tmp' do
described_class.new.perform
expect(untracked_files_for_uploads.count).to eq(5)
end
end
end
end
context 'test bulk insert without ON CONFLICT DO NOTHING or IGNORE' do
before do
# If this is CI, we use Postgres 9.2 so this stub has no effect.
#
# If this is being run on Postgres 9.5+ or MySQL, then this stub allows us
# to test the bulk insert functionality without ON CONFLICT DO NOTHING or
# IGNORE.
allow_any_instance_of(described_class).to receive(:postgresql_pre_9_5?).and_return(true)
end
 
# E.g. The installation is in use at the time of migration, and someone has
# just uploaded a file
context 'when there are files in /uploads/tmp' do
let(:tmp_file) { Rails.root.join(described_class::ABSOLUTE_UPLOAD_DIR, 'tmp', 'some_file.jpg') }
context 'when files were uploaded before and after hashed storage was enabled' do
let!(:appearance) { create(:appearance, logo: uploaded_file, header_logo: uploaded_file) }
let!(:user) { create(:user, :with_avatar) }
let!(:project1) { create(:project, :with_avatar) }
let(:project2) { create(:project) } # instantiate after enabling hashed_storage
 
before do
FileUtils.touch(tmp_file)
end
# Markdown upload before enabling hashed_storage
UploadService.new(project1, uploaded_file, FileUploader).execute
 
after do
FileUtils.rm(tmp_file)
stub_application_setting(hashed_storage_enabled: true)
# Markdown upload after enabling hashed_storage
UploadService.new(project2, uploaded_file, FileUploader).execute
end
 
it 'does not add files from /uploads/tmp' do
it 'adds unhashed files to the untracked_files_for_uploads table' do
described_class.new.perform
 
expect(untracked_files_for_uploads.count).to eq(5)
end
it 'adds files with paths relative to CarrierWave.root' do
described_class.new.perform
untracked_files_for_uploads.all.each do |file|
expect(file.path.start_with?('uploads/')).to be_truthy
end
end
it 'does not add hashed files to the untracked_files_for_uploads table' do
described_class.new.perform
hashed_file_path = project2.uploads.where(uploader: 'FileUploader').first.path
expect(untracked_files_for_uploads.where("path like '%#{hashed_file_path}%'").exists?).to be_falsey
end
it 'correctly schedules the follow-up background migration jobs' do
described_class.new.perform
expect(described_class::FOLLOW_UP_MIGRATION).to be_scheduled_migration(1, 5)
expect(BackgroundMigrationWorker.jobs.size).to eq(1)
end
# E.g. from a previous failed run of this background migration
context 'when there is existing data in untracked_files_for_uploads' do
before do
described_class.new.perform
end
it 'does not error or produce duplicates of existing data' do
expect do
described_class.new.perform
end.not_to change { untracked_files_for_uploads.count }.from(5)
end
end
# E.g. The installation is in use at the time of migration, and someone has
# just uploaded a file
context 'when there are files in /uploads/tmp' do
let(:tmp_file) { Rails.root.join(described_class::ABSOLUTE_UPLOAD_DIR, 'tmp', 'some_file.jpg') }
before do
FileUtils.touch(tmp_file)
end
after do
FileUtils.rm(tmp_file)
end
it 'does not add files from /uploads/tmp' do
described_class.new.perform
expect(untracked_files_for_uploads.count).to eq(5)
end
end
end
end
 
Loading
Loading
Loading
Loading
@@ -62,8 +62,8 @@ describe TrackUntrackedUploads, :migration, :sidekiq do
 
expect(appearance.reload.uploads.where("path like '%/header_logo/%'").first.attributes).to include(@appearance_header_logo_attributes)
expect(user2.reload.uploads.first.attributes).to include(@user2_avatar_attributes)
expect(project2.reload.uploads.first.attributes).to include(@project2_avatar_attributes)
expect(project2.uploads.last.attributes).to include(@project2_markdown_attributes)
expect(project2.reload.uploads.where(uploader: 'AvatarUploader').first.attributes).to include(@project2_avatar_attributes)
expect(project2.uploads.where(uploader: 'FileUploader').first.attributes).to include(@project2_markdown_attributes)
end
 
it 'ignores already-tracked uploads' do
Loading
Loading
@@ -71,8 +71,8 @@ describe TrackUntrackedUploads, :migration, :sidekiq do
 
expect(appearance.reload.uploads.where("path like '%/logo/%'").first.attributes).to include(@appearance_logo_attributes)
expect(user1.reload.uploads.first.attributes).to include(@user1_avatar_attributes)
expect(project1.reload.uploads.first.attributes).to include(@project1_avatar_attributes)
expect(project1.uploads.last.attributes).to include(@project1_markdown_attributes)
expect(project1.reload.uploads.where(uploader: 'AvatarUploader').first.attributes).to include(@project1_avatar_attributes)
expect(project1.uploads.where(uploader: 'FileUploader').first.attributes).to include(@project1_markdown_attributes)
end
 
it 'the temporary table untracked_files_for_uploads no longer exists' do
Loading
Loading
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment