Commit ac847fba authored by Aleksei Lipniagov, committed by Bob Van Landuyt

Log Optimistic Locks with retries

Use structured logging to gather information on Optimistic Locks that
were not able to succeed without retries.
parent e8043280
Showing with 125 additions and 47 deletions
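As the commit message describes, each call site now labels itself with a name: argument so retried locks can be attributed in the logs. The before/after shape of a typical call site, reassembled from the scheduler_failure! hunk below (no new API beyond the name: keyword):

# Before: up to 3 retries, but nothing is recorded about them.
Gitlab::OptimisticLocking.retry_lock(build, 3) do |subject|
  subject.drop!(:scheduler_failure)
end

# After: the label ends up in the structured log entry that is emitted
# whenever at least one retry was needed.
Gitlab::OptimisticLocking.retry_lock(build, 3, name: 'register_job_scheduler_failure') do |subject|
  subject.drop!(:scheduler_failure)
end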
@@ -573,7 +573,7 @@ def auto_canceled?
end
 
def cancel_running(retries: nil)
-retry_optimistic_lock(cancelable_statuses, retries) do |cancelable|
+retry_optimistic_lock(cancelable_statuses, retries, name: 'ci_pipeline_cancel_running') do |cancelable|
cancelable.find_each do |job|
yield(job) if block_given?
job.cancel
@@ -744,7 +744,7 @@ def notes
end
 
def set_status(new_status)
-retry_optimistic_lock(self) do
+retry_optimistic_lock(self, name: 'ci_pipeline_set_status') do
case new_status
when 'created' then nil
when 'waiting_for_resource' then request_resource
@@ -62,7 +62,7 @@ def artifacts_locked?
end
 
def update_status_by!(pipeline)
-retry_lock(self) do
+retry_lock(self, name: 'ci_ref_update_status_by') do
next unless last_finished_pipeline_id == pipeline.id
 
case pipeline.status
@@ -84,7 +84,7 @@ class Stage < ApplicationRecord
end
 
def set_status(new_status)
-retry_optimistic_lock(self) do
+retry_optimistic_lock(self, name: 'ci_stage_set_status') do
case new_status
when 'created' then nil
when 'waiting_for_resource' then request_resource
@@ -242,7 +242,7 @@ def stop_action_available?
def cancel_deployment_jobs!
jobs = active_deployments.with_deployable
jobs.each do |deployment|
-Gitlab::OptimisticLocking.retry_lock(deployment.deployable) do |deployable|
+Gitlab::OptimisticLocking.retry_lock(deployment.deployable, name: 'environment_cancel_deployment_jobs') do |deployable|
deployable.cancel! if deployable&.cancelable?
end
rescue => e
@@ -43,7 +43,7 @@ def execute(bridge)
private
 
def update_bridge_status!(bridge, pipeline)
-Gitlab::OptimisticLocking.retry_lock(bridge) do |subject|
+Gitlab::OptimisticLocking.retry_lock(bridge, name: 'create_downstream_pipeline_update_bridge_status') do |subject|
if pipeline.created_successfully?
# If bridge uses `strategy:depend` we leave it running
# and update the status when the downstream pipeline completes.
@@ -80,7 +80,7 @@ def update_processable!(processable)
return unless Ci::HasStatus::COMPLETED_STATUSES.include?(status)
 
# transition status if possible
-Gitlab::OptimisticLocking.retry_lock(processable) do |subject|
+Gitlab::OptimisticLocking.retry_lock(processable, name: 'atomic_processing_update_processable') do |subject|
Ci::ProcessBuildService.new(project, subject.user)
.execute(subject, status)
 
@@ -152,7 +152,7 @@ def assign_runner!(build, params)
end
 
def scheduler_failure!(build)
-Gitlab::OptimisticLocking.retry_lock(build, 3) do |subject|
+Gitlab::OptimisticLocking.retry_lock(build, 3, name: 'register_job_scheduler_failure') do |subject|
subject.drop!(:scheduler_failure)
end
rescue => ex
@@ -19,7 +19,7 @@ def execute(build)
mark_subsequent_stages_as_processable(build)
build.pipeline.reset_ancestor_bridges!
 
-Gitlab::OptimisticLocking.retry_lock(new_build, &:enqueue)
+Gitlab::OptimisticLocking.retry_lock(new_build, name: 'retry_build', &:enqueue)
 
MergeRequests::AddTodoWhenBuildFailsService
.new(project, current_user)
@@ -68,7 +68,7 @@ def create_build!(attributes)
 
def mark_subsequent_stages_as_processable(build)
build.pipeline.processables.skipped.after_stage(build.stage_idx).find_each do |skipped|
-retry_optimistic_lock(skipped) { |build| build.process(current_user) }
+retry_optimistic_lock(skipped, name: 'ci_retry_build_mark_subsequent_stages') { |build| build.process(current_user) }
end
end
end
@@ -23,7 +23,7 @@ def execute(pipeline)
end
 
pipeline.builds.latest.skipped.find_each do |skipped|
-retry_optimistic_lock(skipped) { |build| build.process(current_user) }
+retry_optimistic_lock(skipped, name: 'ci_retry_pipeline') { |build| build.process(current_user) }
end
 
pipeline.reset_ancestor_bridges!
@@ -12,7 +12,7 @@ def execute
return unless @deployment&.running?
 
older_deployments.find_each do |older_deployment|
-Gitlab::OptimisticLocking.retry_lock(older_deployment.deployable) do |deployable|
+Gitlab::OptimisticLocking.retry_lock(older_deployment.deployable, name: 'older_deployments_drop') do |deployable|
deployable.drop(:forward_deployment_failure)
end
rescue => e
@@ -60,7 +60,7 @@ def unlock!
private
 
def retrieve_with_lock(find_only: false)
-create_or_find!(find_only: find_only).tap { |state| retry_optimistic_lock(state) { |state| yield state } }
+create_or_find!(find_only: find_only).tap { |state| retry_optimistic_lock(state, name: 'terraform_remote_state_handler_retrieve') { |state| yield state } }
end
 
def create_or_find!(find_only:)
@@ -70,7 +70,7 @@ def search(status, timeout, condition)
 
def drop_build(type, build, status, timeout, reason)
Gitlab::AppLogger.info "#{self.class}: Dropping #{type} build #{build.id} for runner #{build.runner_id} (status: #{status}, timeout: #{timeout}, reason: #{reason})"
-Gitlab::OptimisticLocking.retry_lock(build, 3) do |b|
+Gitlab::OptimisticLocking.retry_lock(build, 3, name: 'stuck_ci_jobs_worker_drop_build') do |b|
b.drop(reason)
end
rescue => ex
---
title: Log Optimistic Locks with retries
merge_request: 55187
author:
type: other
@@ -25,7 +25,7 @@ def initialize(*)
def perform!
return unless limit.exceeded?
 
-retry_optimistic_lock(pipeline) do
+retry_optimistic_lock(pipeline, name: 'ci_pipeline_chain_limit_activity') do
pipeline.drop!(:activity_limit_exceeded)
limit.log_error!(project_id: project.id, plan: project.actual_plan_name)
end
@@ -25,7 +25,7 @@ def initialize(*)
def perform!
return unless limit.exceeded?
 
-retry_optimistic_lock(pipeline) do
+retry_optimistic_lock(pipeline, name: 'ci_pipeline_chain_limit_job_activity') do
pipeline.drop!(:job_activity_limit_exceeded)
limit.log_error!(project_id: project.id, plan: project.actual_plan_name)
end
@@ -10,7 +10,7 @@ class CancelPendingPipelines < Chain::Base
def perform!
return unless project.auto_cancel_pending_pipelines?
 
-Gitlab::OptimisticLocking.retry_lock(auto_cancelable_pipelines) do |cancelables|
+Gitlab::OptimisticLocking.retry_lock(auto_cancelable_pipelines, name: 'cancel_pending_pipelines') do |cancelables|
cancelables.find_each do |cancelable|
cancelable.auto_cancel_running(pipeline)
end
@@ -2,22 +2,49 @@
 
 module Gitlab
   module OptimisticLocking
+    MAX_RETRIES = 100
+
     module_function
 
-    def retry_lock(subject, retries = nil, &block)
-      retries ||= 100
-      # TODO(Observability): We should be recording details of the number of retries and the duration of the total execution here
-      ActiveRecord::Base.transaction do
-        yield(subject)
-      end
-    rescue ActiveRecord::StaleObjectError
-      retries -= 1
-      raise unless retries >= 0
+    def retry_lock(subject, max_retries = MAX_RETRIES, name:, &block)
+      start_time = Gitlab::Metrics::System.monotonic_time
+      retry_attempts = 0
+
+      begin
+        ActiveRecord::Base.transaction do
+          yield(subject)
+        end
+      rescue ActiveRecord::StaleObjectError
+        raise unless retry_attempts < max_retries
 
-      subject.reset
-      retry
+        subject.reset
+        retry_attempts += 1
+        retry
+      ensure
+        elapsed_time = Gitlab::Metrics::System.monotonic_time - start_time
+
+        log_optimistic_lock_retries(
+          name: name,
+          retry_attempts: retry_attempts,
+          elapsed_time: elapsed_time)
+      end
     end
 
     alias_method :retry_optimistic_lock, :retry_lock
+
+    def log_optimistic_lock_retries(name:, retry_attempts:, elapsed_time:)
+      return unless retry_attempts > 0
+
+      retry_lock_logger.info(
+        message: "Optimistic Lock released with retries",
+        name: name,
+        retries: retry_attempts,
+        time_s: elapsed_time)
+    end
+
+    def retry_lock_logger
+      @retry_lock_logger ||= Gitlab::Services::Logger.build
+    end
   end
 end
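With the new ensure block, callers keep the same invocation apart from the name: keyword; when at least one StaleObjectError retry happens, a single structured record is emitted. A minimal sketch, assuming a Ci::Pipeline record named pipeline is in scope (the values in the comment are illustrative; only the field names come from log_optimistic_lock_retries above):

# A conflicting update elsewhere bumps lock_version, so the first attempt
# raises ActiveRecord::StaleObjectError; retry_lock resets the record,
# retries, and then logs one record roughly like:
#   { message: "Optimistic Lock released with retries",
#     name: "ci_pipeline_set_status", retries: 1, time_s: 0.2 }
Gitlab::OptimisticLocking.retry_lock(pipeline, name: 'ci_pipeline_set_status') do |subject|
  subject.succeed
end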
@@ -7,35 +7,81 @@
   let!(:pipeline2) { Ci::Pipeline.find(pipeline.id) }
 
   describe '#retry_lock' do
-    it 'does not reload object if state changes' do
-      expect(pipeline).not_to receive(:reset)
-      expect(pipeline).to receive(:succeed).and_call_original
+    let(:name) { 'optimistic_locking_spec' }
 
-      described_class.retry_lock(pipeline) do |subject|
-        subject.succeed
+    context 'when state changed successfully without retries' do
+      subject do
+        described_class.retry_lock(pipeline, name: name) do |lock_subject|
+          lock_subject.succeed
+        end
       end
-    end
 
-    it 'retries action if exception is raised' do
-      pipeline.succeed
+      it 'does not reload object' do
+        expect(pipeline).not_to receive(:reset)
+        expect(pipeline).to receive(:succeed).and_call_original
+
+        subject
+      end
 
-      expect(pipeline2).to receive(:reset).and_call_original
-      expect(pipeline2).to receive(:drop).twice.and_call_original
+      it 'does not create log record' do
+        expect(described_class.retry_lock_logger).not_to receive(:info)
 
-      described_class.retry_lock(pipeline2) do |subject|
-        subject.drop
+        subject
       end
     end
 
-    it 'raises exception when too many retries' do
-      expect(pipeline).to receive(:drop).twice.and_call_original
+    context 'when at least one retry happened, the change succeeded' do
+      subject do
+        described_class.retry_lock(pipeline2, name: 'optimistic_locking_spec') do |lock_subject|
+          lock_subject.drop
+        end
+      end
+
+      it 'completes the action' do
+        pipeline.succeed
+
+        expect(pipeline2).to receive(:reset).and_call_original
+        expect(pipeline2).to receive(:drop).twice.and_call_original
+
+        subject
+      end
+
+      it 'creates a single log record' do
+        pipeline.succeed
+
+        expect(described_class.retry_lock_logger)
+          .to receive(:info)
+          .once
+          .with(hash_including(:time_s, name: name, retries: 1))
 
-      expect do
-        described_class.retry_lock(pipeline, 1) do |subject|
-          subject.lock_version = 100
-          subject.drop
+        subject
+      end
+    end
+
+    context 'when MAX_RETRIES attempts exceeded' do
+      subject do
+        described_class.retry_lock(pipeline, max_retries, name: name) do |lock_subject|
+          lock_subject.lock_version = 100
+          lock_subject.drop
         end
-      end.to raise_error(ActiveRecord::StaleObjectError)
-    end
+      end
+
+      let(:max_retries) { 2 }
+
+      it 'raises an exception' do
+        expect(pipeline).to receive(:drop).exactly(max_retries + 1).times.and_call_original
+
+        expect { subject }.to raise_error(ActiveRecord::StaleObjectError)
+      end
+
+      it 'creates a single log record' do
+        expect(described_class.retry_lock_logger)
+          .to receive(:info)
+          .once
+          .with(hash_including(:time_s, name: name, retries: max_retries))
+
+        expect { subject }.to raise_error(ActiveRecord::StaleObjectError)
+      end
+    end
   end
 