Skip to content
Snippets Groups Projects
Commit c2e0e689 authored by Shinya Maeda's avatar Shinya Maeda
Browse files

Validate the existence of archived traces before removing live trace

Often live traces are removed even though the archived trace
doesn't exist. This commit checkes the existence strictly.
parent 1a3fda63
No related branches found
No related tags found
No related merge requests found
Showing
with 214 additions and 20 deletions
Loading
Loading
@@ -531,6 +531,14 @@ module Ci
trace.exist?
end
 
def has_live_trace?
trace.live_trace_exist?
end
def has_archived_trace?
trace.archived_trace_exist?
end
def artifacts_file
job_artifacts_archive&.file
end
Loading
Loading
Loading
Loading
@@ -176,6 +176,10 @@ module Ci
end
end
 
def self.archived_trace_exists_for?(job_id)
where(job_id: job_id).trace.take&.file&.file&.exists?
end
private
 
def file_format_adapter_class
Loading
Loading
Loading
Loading
@@ -2,8 +2,25 @@
 
module Ci
class ArchiveTraceService
def execute(job)
def execute(job, worker_name:)
# TODO: Remove this logging once we confirmed new live trace architecture is functional.
# See https://gitlab.com/gitlab-com/gl-infra/infrastructure/issues/4667.
unless job.has_live_trace?
Sidekiq.logger.warn(class: worker_name,
message: 'The job does not have live trace but going to be archived.',
job_id: job.id)
return
end
job.trace.archive!
# TODO: Remove this logging once we confirmed new live trace architecture is functional.
# See https://gitlab.com/gitlab-com/gl-infra/infrastructure/issues/4667.
unless job.has_archived_trace?
Sidekiq.logger.warn(class: worker_name,
message: 'The job does not have archived trace after archiving.',
job_id: job.id)
end
rescue ::Gitlab::Ci::Trace::AlreadyArchivedError
# It's already archived, thus we can safely ignore this exception.
rescue => e
Loading
Loading
@@ -11,7 +28,7 @@ module Ci
# If `archive!` keeps failing for over a week, that could incur data loss.
# (See more https://docs.gitlab.com/ee/administration/job_traces.html#new-live-trace-architecture)
# In order to avoid interrupting the system, we do not raise an exception here.
archive_error(e, job)
archive_error(e, job, worker_name)
end
 
private
Loading
Loading
@@ -22,9 +39,12 @@ module Ci
"Counter of failed attempts of trace archiving")
end
 
def archive_error(error, job)
def archive_error(error, job, worker_name)
failed_archive_counter.increment
Rails.logger.error "Failed to archive trace. id: #{job.id} message: #{error.message}" # rubocop:disable Gitlab/RailsLogger
Sidekiq.logger.warn(class: worker_name,
message: "Failed to archive trace. message: #{error.message}.",
job_id: job.id)
 
Gitlab::Sentry
.track_exception(error,
Loading
Loading
Loading
Loading
@@ -7,7 +7,7 @@ class ArchiveTraceWorker
# rubocop: disable CodeReuse/ActiveRecord
def perform(job_id)
Ci::Build.without_archived_trace.find_by(id: job_id).try do |job|
Ci::ArchiveTraceService.new.execute(job)
Ci::ArchiveTraceService.new.execute(job, worker_name: self.class.name)
end
end
# rubocop: enable CodeReuse/ActiveRecord
Loading
Loading
Loading
Loading
@@ -11,7 +11,7 @@ module Ci
# This could happen when ArchiveTraceWorker sidekiq jobs were lost by receiving SIGKILL
# More details in https://gitlab.com/gitlab-org/gitlab-ce/issues/36791
Ci::Build.finished.with_live_trace.find_each(batch_size: 100) do |build|
Ci::ArchiveTraceService.new.execute(build)
Ci::ArchiveTraceService.new.execute(build, worker_name: self.class.name)
end
end
# rubocop: enable CodeReuse/ActiveRecord
Loading
Loading
---
title: Extra logging for new live trace architecture
merge_request: 30892
author:
type: fixed
Loading
Loading
@@ -63,7 +63,15 @@ module Gitlab
end
 
def exist?
trace_artifact&.exists? || job.trace_chunks.any? || current_path.present? || old_trace.present?
archived_trace_exist? || live_trace_exist?
end
def archived_trace_exist?
trace_artifact&.exists?
end
def live_trace_exist?
job.trace_chunks.any? || current_path.present? || old_trace.present?
end
 
def read
Loading
Loading
@@ -167,7 +175,7 @@ module Gitlab
 
def clone_file!(src_stream, temp_dir)
FileUtils.mkdir_p(temp_dir)
Dir.mktmpdir('tmp-trace', temp_dir) do |dir_path|
Dir.mktmpdir("tmp-trace-#{job.id}", temp_dir) do |dir_path|
temp_path = File.join(dir_path, "job.log")
FileUtils.touch(temp_path)
size = IO.copy_stream(src_stream, temp_path)
Loading
Loading
Loading
Loading
@@ -166,6 +166,13 @@ module Gitlab
end
 
def destroy!
# TODO: Remove this logging once we confirmed new live trace architecture is functional.
# See https://gitlab.com/gitlab-com/gl-infra/infrastructure/issues/4667.
unless build.has_archived_trace?
Sidekiq.logger.warn(message: 'The job does not have archived trace but going to be destroyed.',
job_id: build.id)
end
trace_chunks.fast_destroy_all
@tell = @size = 0
ensure
Loading
Loading
Loading
Loading
@@ -442,5 +442,15 @@ describe Gitlab::Ci::Trace::ChunkedIO, :clean_gitlab_redis_cache do
 
expect(Ci::BuildTraceChunk.where(build: build).count).to eq(0)
end
context 'when the job does not have archived trace' do
it 'leaves a message in sidekiq log' do
expect(Sidekiq.logger).to receive(:warn).with(
message: 'The job does not have archived trace but going to be destroyed.',
job_id: build.id).and_call_original
subject
end
end
end
end
Loading
Loading
@@ -692,6 +692,34 @@ describe Ci::Build do
end
end
 
describe '#has_live_trace?' do
subject { build.has_live_trace? }
let(:build) { create(:ci_build, :trace_live) }
it { is_expected.to be_truthy }
context 'when build does not have live trace' do
let(:build) { create(:ci_build) }
it { is_expected.to be_falsy }
end
end
describe '#has_archived_trace?' do
subject { build.has_archived_trace? }
let(:build) { create(:ci_build, :trace_artifact) }
it { is_expected.to be_truthy }
context 'when build does not have archived trace' do
let(:build) { create(:ci_build) }
it { is_expected.to be_falsy }
end
end
describe '#has_job_artifacts?' do
subject { build.has_job_artifacts? }
 
Loading
Loading
Loading
Loading
@@ -70,6 +70,31 @@ describe Ci::JobArtifact do
end
end
 
describe '.archived_trace_exists_for?' do
subject { described_class.archived_trace_exists_for?(job_id) }
let!(:artifact) { create(:ci_job_artifact, :trace, job: job) }
let(:job) { create(:ci_build) }
context 'when the specified job_id exists' do
let(:job_id) { job.id }
it { is_expected.to be_truthy }
context 'when the job does have archived trace' do
let!(:artifact) { }
it { is_expected.to be_falsy }
end
end
context 'when the specified job_id does not exist' do
let(:job_id) { 10000 }
it { is_expected.to be_falsy }
end
end
describe 'callbacks' do
subject { create(:ci_job_artifact, :archive) }
 
Loading
Loading
Loading
Loading
@@ -3,7 +3,7 @@
require 'spec_helper'
 
describe Ci::ArchiveTraceService, '#execute' do
subject { described_class.new.execute(job) }
subject { described_class.new.execute(job, worker_name: ArchiveTraceWorker.name) }
 
context 'when job is finished' do
let(:job) { create(:ci_build, :success, :trace_live) }
Loading
Loading
@@ -25,6 +25,34 @@ describe Ci::ArchiveTraceService, '#execute' do
expect { subject }.not_to change { Ci::JobArtifact.trace.count }
end
end
context 'when job does not have trace' do
let(:job) { create(:ci_build, :success) }
it 'leaves a warning message in sidekiq log' do
expect(Sidekiq.logger).to receive(:warn).with(
class: ArchiveTraceWorker.name,
message: 'The job does not have live trace but going to be archived.',
job_id: job.id)
subject
end
end
context 'when job failed to archive trace but did not raise an exception' do
before do
allow_any_instance_of(Gitlab::Ci::Trace).to receive(:archive!) {}
end
it 'leaves a warning message in sidekiq log' do
expect(Sidekiq.logger).to receive(:warn).with(
class: ArchiveTraceWorker.name,
message: 'The job does not have archived trace after archiving.',
job_id: job.id)
subject
end
end
end
 
context 'when job is running' do
Loading
Loading
@@ -37,10 +65,10 @@ describe Ci::ArchiveTraceService, '#execute' do
issue_url: 'https://gitlab.com/gitlab-org/gitlab-ce/issues/51502',
extra: { job_id: job.id } ).once
 
expect(Rails.logger)
.to receive(:error)
.with("Failed to archive trace. id: #{job.id} message: Job is not finished yet")
.and_call_original
expect(Sidekiq.logger).to receive(:warn).with(
class: ArchiveTraceWorker.name,
message: "Failed to archive trace. message: Job is not finished yet.",
job_id: job.id).and_call_original
 
expect(Gitlab::Metrics)
.to receive(:counter)
Loading
Loading
Loading
Loading
@@ -720,6 +720,58 @@ shared_examples_for 'trace with enabled live trace feature' do
end
end
 
describe '#archived_trace_exist?' do
subject { trace.archived_trace_exist? }
context 'when trace does not exist' do
it { is_expected.to be_falsy }
end
context 'when archived trace exists' do
before do
create(:ci_job_artifact, :trace, job: build)
end
it { is_expected.to be_truthy }
end
context 'when live trace exists' do
before do
Gitlab::Ci::Trace::ChunkedIO.new(build) do |stream|
stream.write('abc')
end
end
it { is_expected.to be_falsy }
end
end
describe '#live_trace_exist?' do
subject { trace.live_trace_exist? }
context 'when trace does not exist' do
it { is_expected.to be_falsy }
end
context 'when archived trace exists' do
before do
create(:ci_job_artifact, :trace, job: build)
end
it { is_expected.to be_falsy }
end
context 'when live trace exists' do
before do
Gitlab::Ci::Trace::ChunkedIO.new(build) do |stream|
stream.write('abc')
end
end
it { is_expected.to be_truthy }
end
end
describe '#archive!' do
subject { trace.archive! }
 
Loading
Loading
Loading
Loading
@@ -11,7 +11,7 @@ describe ArchiveTraceWorker do
 
it 'executes service' do
expect_any_instance_of(Ci::ArchiveTraceService)
.to receive(:execute).with(job)
.to receive(:execute).with(job, anything)
 
subject
end
Loading
Loading
Loading
Loading
@@ -34,7 +34,7 @@ describe Ci::ArchiveTracesCronWorker do
 
it 'executes service' do
expect_any_instance_of(Ci::ArchiveTraceService)
.to receive(:execute).with(build)
.to receive(:execute).with(build, anything)
 
subject
end
Loading
Loading
@@ -60,7 +60,10 @@ describe Ci::ArchiveTracesCronWorker do
end
 
it 'puts a log' do
expect(Rails.logger).to receive(:error).with("Failed to archive trace. id: #{build.id} message: Unexpected error")
expect(Sidekiq.logger).to receive(:warn).with(
class: described_class.name,
message: "Failed to archive trace. message: Unexpected error.",
job_id: build.id)
 
subject
end
Loading
Loading
Loading
Loading
@@ -7,8 +7,6 @@ describe StuckCiJobsWorker do
 
let!(:runner) { create :ci_runner }
let!(:job) { create :ci_build, runner: runner }
let(:trace_lease_key) { "trace:write:lock:#{job.id}" }
let(:trace_lease_uuid) { SecureRandom.uuid }
let(:worker_lease_key) { StuckCiJobsWorker::EXCLUSIVE_LEASE_KEY }
let(:worker_lease_uuid) { SecureRandom.uuid }
 
Loading
Loading
@@ -16,7 +14,6 @@ describe StuckCiJobsWorker do
 
before do
stub_exclusive_lease(worker_lease_key, worker_lease_uuid)
stub_exclusive_lease(trace_lease_key, trace_lease_uuid)
job.update!(status: status, updated_at: updated_at)
end
 
Loading
Loading
@@ -195,7 +192,6 @@ describe StuckCiJobsWorker do
end
 
it 'cancels exclusive leases after worker perform' do
expect_to_cancel_exclusive_lease(trace_lease_key, trace_lease_uuid)
expect_to_cancel_exclusive_lease(worker_lease_key, worker_lease_uuid)
 
worker.perform
Loading
Loading
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment