Instrument Unicorn with Ruby exporter

26ac691a · Paweł Chojnacki · Rémy Coutable · 53c626bc · 26ac691a · 26ac691a
Commit 26ac691a authored 7 years ago by Paweł Chojnacki Committed by Rémy Coutable 7 years ago
--- a/Gemfile
+++ b/Gemfile
@@ -285,6 +285,7 @@ group :metrics do
  # Prometheus
  gem 'prometheus-client-mmap', '~>0.7.0.beta5'
+  gem 'raindrops', '~> 0.18'
 end
 group :development do

--- a/Gemfile.lock
+++ b/Gemfile.lock
@@ -599,7 +599,7 @@ GEM
    premailer-rails (1.9.7)
      actionmailer (>= 3, < 6)
      premailer (~> 1.7, >= 1.7.9)
-    prometheus-client-mmap (0.7.0.beta5)
+    prometheus-client-mmap (0.7.0.beta7)
      mmap2 (~> 2.2.6)
    pry (0.10.4)
      coderay (~> 1.1.0)
@@ -658,7 +658,7 @@ GEM
      thor (>= 0.18.1, < 2.0)
    rainbow (2.2.2)
      rake
-    raindrops (0.17.0)
+    raindrops (0.18.0)
    rake (10.5.0)
    rblineprof (0.3.6)
      debugger-ruby_core_source (~> 1.3)
@@ -1062,6 +1062,7 @@ DEPENDENCIES
  rails-deprecated_sanitizer (~> 1.0.3)
  rails-i18n (~> 4.0.9)
  rainbow (~> 2.2)
+  raindrops (~> 0.18)
  rblineprof (~> 0.3.6)
  rdoc (~> 4.2)
  recaptcha (~> 3.0)

--- a/config/gitlab.yml.example
+++ b/config/gitlab.yml.example
@@ -543,6 +543,10 @@ production: &base
    #   enabled: true
    #   host: localhost
    #   port: 3808
+  prometheus:
+    # Time between sampling of unicorn socket metrics, in seconds
+    # unicorn_sampler_interval: 10
  #
  # 5. Extra customization

--- a/config/initializers/1_settings.rb
+++ b/config/initializers/1_settings.rb
@@ -494,6 +494,12 @@ Settings.webpack.dev_server['enabled'] ||= false
 Settings.webpack.dev_server['host']    ||= 'localhost'
 Settings.webpack.dev_server['port']    ||= 3808
+#
+# Prometheus metrics settings
+#
+Settings['prometheus'] ||= Settingslogic.new({})
+Settings.prometheus['unicorn_sampler_interval'] ||= 10
 #
 # Testing settings
 #

--- a/config/initializers/8_metrics.rb
+++ b/config/initializers/8_metrics.rb
@@ -119,6 +119,13 @@ def instrument_classes(instrumentation)
 end
 # rubocop:enable Metrics/AbcSize
+Gitlab::Metrics::UnicornSampler.initialize_instance(Settings.prometheus.unicorn_sampler_interval).start
+Gitlab::Application.configure do |config|
+  # Position 0 should be Sentry so that it can catch errors raised by this middleware
+  config.middleware.insert(1, Gitlab::Metrics::ConnectionRackMiddleware)
+end
 if Gitlab::Metrics.enabled?
  require 'pathname'
  require 'influxdb'
@@ -175,7 +182,7 @@ if Gitlab::Metrics.enabled?
  GC::Profiler.enable
-  Gitlab::Metrics::Sampler.new.start
+  Gitlab::Metrics::InfluxSampler.initialize_instance.start
  module TrackNewRedisConnections
    def connect(*args)

--- a/lib/gitlab/metrics/base_sampler.rb
+++ b/lib/gitlab/metrics/base_sampler.rb
+require 'logger'
+module Gitlab
+  module Metrics
+    class BaseSampler
+      def self.initialize_instance(*args)
+        raise "#{name} singleton instance already initialized" if @instance
+        @instance = new(*args)
+        at_exit(&@instance.method(:stop))
+        @instance
+      end
+      def self.instance
+        @instance
+      end
+      attr_reader :running
+      # interval - The sampling interval in seconds.
+      def initialize(interval)
+        interval_half = interval.to_f / 2
+        @interval = interval
+        @interval_steps = (-interval_half..interval_half).step(0.1).to_a
+        @mutex = Mutex.new
+      end
+      def enabled?
+        true
+      end
+      def start
+        return unless enabled?
+        @mutex.synchronize do
+          return if running
+          @running = true
+          @thread = Thread.new do
+            sleep(sleep_interval)
+            while running
+              safe_sample
+              sleep(sleep_interval)
+            end
+          end
+        end
+      end
+      def stop
+        @mutex.synchronize do
+          return unless running
+          @running = false
+          if @thread
+            @thread.wakeup if @thread.alive?
+            @thread.join
+            @thread = nil
+          end
+        end
+      end
+      def safe_sample
+        sample
+      rescue => e
+        Rails.logger.warn("#{self.class}: #{e}, stopping")
+        stop
+      end
+      def sample
+        raise NotImplementedError
+      end
+      # Returns the sleep interval with a random adjustment.
+      #
+      # The random adjustment is put in place to ensure we:
+      #
+      # 1. Don't generate samples at the exact same interval every time (thus
+      #    potentially missing anything that happens in between samples).
+      # 2. Don't sample data at the same interval two times in a row.
+      def sleep_interval
+        while step = @interval_steps.sample
+          if step != @last_step
+            @last_step = step
+            return @interval + @last_step
+          end
+        end
+      end
+    end
+  end
+end
--- a/lib/gitlab/metrics/connection_rack_middleware.rb
+++ b/lib/gitlab/metrics/connection_rack_middleware.rb
+module Gitlab
+  module Metrics
+    class ConnectionRackMiddleware
+      def initialize(app)
+        @app = app
+      end
+      def self.rack_request_count
+        @rack_request_count ||= Gitlab::Metrics.counter(:rack_request, 'Rack request count')
+      end
+      def self.rack_response_count
+        @rack_response_count ||= Gitlab::Metrics.counter(:rack_response, 'Rack response count')
+      end
+      def self.rack_uncaught_errors_count
+        @rack_uncaught_errors_count ||= Gitlab::Metrics.counter(:rack_uncaught_errors, 'Rack connections handling uncaught errors count')
+      end
+      def self.rack_execution_time
+        @rack_execution_time ||= Gitlab::Metrics.histogram(:rack_execution_time, 'Rack connection handling execution time',
+                                                           {}, [0.05, 0.1, 0.25, 0.5, 0.7, 1, 1.5, 2, 2.5, 3, 5, 7, 10])
+      end
+      def call(env)
+        method = env['REQUEST_METHOD'].downcase
+        started = Time.now.to_f
+        begin
+          ConnectionRackMiddleware.rack_request_count.increment(method: method)
+          status, headers, body = @app.call(env)
+          ConnectionRackMiddleware.rack_response_count.increment(method: method, status: status)
+          [status, headers, body]
+        rescue
+          ConnectionRackMiddleware.rack_uncaught_errors_count.increment
+          raise
+        ensure
+          elapsed = Time.now.to_f - started
+          ConnectionRackMiddleware.rack_execution_time.observe({}, elapsed)
+        end
+      end
+    end
+  end
+end
--- a/lib/gitlab/metrics/sampler.rb
+++ b/lib/gitlab/metrics/sampler.rb
@@ -5,14 +5,11 @@ module Gitlab
    # This class is used to gather statistics that can't be directly associated
    # with a transaction such as system memory usage, garbage collection
    # statistics, etc.
-    class Sampler
+    class InfluxSampler < BaseSampler
      # interval - The sampling interval in seconds.
      def initialize(interval = Metrics.settings[:sample_interval])
-        interval_half = interval.to_f / 2
+        super(interval)
+        @last_step = nil
-        @interval       = interval
-        @interval_steps = (-interval_half..interval_half).step(0.1).to_a
-        @last_step      = nil
        @metrics = []
@@ -26,18 +23,6 @@ module Gitlab
        end
      end
-      def start
-        Thread.new do
-          Thread.current.abort_on_exception = true
-          loop do
-            sleep(sleep_interval)
-            sample
-          end
-        end
-      end
      def sample
        sample_memory_usage
        sample_file_descriptors
@@ -86,7 +71,7 @@ module Gitlab
      end
      def sample_gc
-        time  = GC::Profiler.total_time * 1000.0
+        time = GC::Profiler.total_time * 1000.0
        stats = GC.stat.merge(total_time: time)
        # We want the difference of GC runs compared to the last sample, not the
@@ -111,23 +96,6 @@ module Gitlab
      def sidekiq?
        Sidekiq.server?
      end
-      # Returns the sleep interval with a random adjustment.
-      #
-      # The random adjustment is put in place to ensure we:
-      #
-      # 1. Don't generate samples at the exact same interval every time (thus
-      #    potentially missing anything that happens in between samples).
-      # 2. Don't sample data at the same interval two times in a row.
-      def sleep_interval
-        while step = @interval_steps.sample
-          if step != @last_step
-            @last_step = step
-            return @interval + @last_step
-          end
-        end
-      end
    end
  end
 end
--- a/lib/gitlab/metrics/prometheus.rb
+++ b/lib/gitlab/metrics/prometheus.rb
@@ -29,8 +29,8 @@ module Gitlab
        provide_metric(name) || registry.summary(name, docstring, base_labels)
      end
-      def gauge(name, docstring, base_labels = {})
+      def gauge(name, docstring, base_labels = {}, multiprocess_mode = :all)
-        provide_metric(name) || registry.gauge(name, docstring, base_labels)
+        provide_metric(name) || registry.gauge(name, docstring, base_labels, multiprocess_mode)
      end
      def histogram(name, docstring, base_labels = {}, buckets = ::Prometheus::Client::Histogram::DEFAULT_BUCKETS)

--- a/lib/gitlab/metrics/unicorn_sampler.rb
+++ b/lib/gitlab/metrics/unicorn_sampler.rb
+module Gitlab
+  module Metrics
+    class UnicornSampler < BaseSampler
+      def initialize(interval)
+        super(interval)
+      end
+      def unicorn_active_connections
+        @unicorn_active_connections ||= Gitlab::Metrics.gauge(:unicorn_active_connections, 'Unicorn active connections', {}, :max)
+      end
+      def unicorn_queued_connections
+        @unicorn_queued_connections ||= Gitlab::Metrics.gauge(:unicorn_queued_connections, 'Unicorn queued connections', {}, :max)
+      end
+      def enabled?
+        # Raindrops::Linux.tcp_listener_stats is only present on Linux
+        unicorn_with_listeners? && Raindrops::Linux.respond_to?(:tcp_listener_stats)
+      end
+      def sample
+        Raindrops::Linux.tcp_listener_stats(tcp_listeners).each do |addr, stats|
+          unicorn_active_connections.set({ type: 'tcp', address: addr }, stats.active)
+          unicorn_queued_connections.set({ type: 'tcp', address: addr }, stats.queued)
+        end
+        Raindrops::Linux.unix_listener_stats(unix_listeners).each do |addr, stats|
+          unicorn_active_connections.set({ type: 'unix', address: addr }, stats.active)
+          unicorn_queued_connections.set({ type: 'unix', address: addr }, stats.queued)
+        end
+      end
+      private
+      def tcp_listeners
+        @tcp_listeners ||= Unicorn.listener_names.grep(%r{\A[^/]+:\d+\z})
+      end
+      def unix_listeners
+        @unix_listeners ||= Unicorn.listener_names - tcp_listeners
+      end
+      def unicorn_with_listeners?
+        defined?(Unicorn) && Unicorn.listener_names.any?
+      end
+    end
+  end
+end
--- a/spec/initializers/8_metrics_spec.rb
+++ b/spec/initializers/8_metrics_spec.rb
 require 'spec_helper'
-require_relative '../../config/initializers/8_metrics'
 describe 'instrument_classes', lib: true do
  let(:config) { double(:config) }
+  let(:unicorn_sampler) { double(:unicorn_sampler) }
+  let(:influx_sampler) { double(:influx_sampler) }
  before do
    allow(config).to receive(:instrument_method)
    allow(config).to receive(:instrument_methods)
    allow(config).to receive(:instrument_instance_method)
    allow(config).to receive(:instrument_instance_methods)
+    allow(Gitlab::Metrics::UnicornSampler).to receive(:initialize_instance).and_return(unicorn_sampler)
+    allow(Gitlab::Metrics::InfluxSampler).to receive(:initialize_instance).and_return(influx_sampler)
+    allow(unicorn_sampler).to receive(:start)
+    allow(influx_sampler).to receive(:start)
+    allow(Gitlab::Application).to receive(:configure)
  end
  it 'can autoload and instrument all files' do
+    require_relative '../../config/initializers/8_metrics'
    expect { instrument_classes(config) }.not_to raise_error
  end
 end
--- a/spec/lib/gitlab/metrics/connection_rack_middleware_spec.rb
+++ b/spec/lib/gitlab/metrics/connection_rack_middleware_spec.rb
+require 'spec_helper'
+describe Gitlab::Metrics::ConnectionRackMiddleware do
+  let(:app) { double('app') }
+  subject { described_class.new(app) }
+  around do |example|
+    Timecop.freeze { example.run }
+  end
+  describe '#call' do
+    let(:status) { 100 }
+    let(:env) { { 'REQUEST_METHOD' => 'GET' } }
+    let(:stack_result) { [status, {}, 'body'] }
+    before do
+      allow(app).to receive(:call).and_return(stack_result)
+    end
+    context '@app.call succeeds with 200' do
+      before do
+        allow(app).to receive(:call).and_return([200, nil, nil])
+      end
+      it 'increments response count with status label' do
+        expect(described_class).to receive_message_chain(:rack_response_count, :increment).with(include(status: 200, method: 'get'))
+        subject.call(env)
+      end
+      it 'increments requests count' do
+        expect(described_class).to receive_message_chain(:rack_request_count, :increment).with(method: 'get')
+        subject.call(env)
+      end
+      it 'measures execution time' do
+        execution_time = 10
+        allow(app).to receive(:call) do |*args|
+          Timecop.freeze(execution_time.seconds)
+        end
+        expect(described_class).to receive_message_chain(:rack_execution_time, :observe).with({}, execution_time)
+        subject.call(env)
+      end
+    end
+    context '@app.call throws exception' do
+      let(:rack_response_count) { double('rack_response_count') }
+      before do
+        allow(app).to receive(:call).and_raise(StandardError)
+        allow(described_class).to receive(:rack_response_count).and_return(rack_response_count)
+      end
+      it 'increments exceptions count' do
+        expect(described_class).to receive_message_chain(:rack_uncaught_errors_count, :increment)
+        expect { subject.call(env) }.to raise_error(StandardError)
+      end
+      it 'increments requests count' do
+        expect(described_class).to receive_message_chain(:rack_request_count, :increment).with(method: 'get')
+        expect { subject.call(env) }.to raise_error(StandardError)
+      end
+      it "does't increment response count" do
+        expect(described_class.rack_response_count).not_to receive(:increment)
+        expect { subject.call(env) }.to raise_error(StandardError)
+      end
+      it 'measures execution time' do
+        execution_time = 10
+        allow(app).to receive(:call) do |*args|
+          Timecop.freeze(execution_time.seconds)
+          raise StandardError
+        end
+        expect(described_class).to receive_message_chain(:rack_execution_time, :observe).with({}, execution_time)
+        expect { subject.call(env) }.to raise_error(StandardError)
+      end
+    end
+  end
+end
--- a/spec/lib/gitlab/metrics/sampler_spec.rb
+++ b/spec/lib/gitlab/metrics/sampler_spec.rb
 require 'spec_helper'
-describe Gitlab::Metrics::Sampler do
+describe Gitlab::Metrics::InfluxSampler do
  let(:sampler) { described_class.new(5) }
  after do
@@ -8,10 +8,10 @@ describe Gitlab::Metrics::Sampler do
  end
  describe '#start' do
-    it 'gathers a sample at a given interval' do
+    it 'runs once and gathers a sample at a given interval' do
-      expect(sampler).to receive(:sleep).with(a_kind_of(Numeric))
+      expect(sampler).to receive(:sleep).with(a_kind_of(Numeric)).twice
-      expect(sampler).to receive(:sample)
+      expect(sampler).to receive(:sample).once
-      expect(sampler).to receive(:loop).and_yield
+      expect(sampler).to receive(:running).and_return(false, true, false)
      sampler.start.join
    end

--- a/spec/lib/gitlab/metrics/unicorn_sampler_spec.rb
+++ b/spec/lib/gitlab/metrics/unicorn_sampler_spec.rb
+require 'spec_helper'
+describe Gitlab::Metrics::UnicornSampler do
+  subject { described_class.new(1.second) }
+  describe '#sample' do
+    let(:unicorn) { double('unicorn') }
+    let(:raindrops) { double('raindrops') }
+    let(:stats) { double('stats') }
+    before do
+      stub_const('Unicorn', unicorn)
+      stub_const('Raindrops::Linux', raindrops)
+      allow(raindrops).to receive(:unix_listener_stats).and_return({})
+      allow(raindrops).to receive(:tcp_listener_stats).and_return({})
+    end
+    context 'unicorn listens on unix sockets' do
+      let(:socket_address) { '/some/sock' }
+      let(:sockets) { [socket_address] }
+      before do
+        allow(unicorn).to receive(:listener_names).and_return(sockets)
+      end
+      it 'samples socket data' do
+        expect(raindrops).to receive(:unix_listener_stats).with(sockets)
+        subject.sample
+      end
+      context 'stats collected' do
+        before do
+          allow(stats).to receive(:active).and_return('active')
+          allow(stats).to receive(:queued).and_return('queued')
+          allow(raindrops).to receive(:unix_listener_stats).and_return({ socket_address => stats })
+        end
+        it 'updates metrics type unix and with addr' do
+          labels = { type: 'unix', address: socket_address }
+          expect(subject).to receive_message_chain(:unicorn_active_connections, :set).with(labels, 'active')
+          expect(subject).to receive_message_chain(:unicorn_queued_connections, :set).with(labels, 'queued')
+          subject.sample
+        end
+      end
+    end
+    context 'unicorn listens on tcp sockets' do
+      let(:tcp_socket_address) { '0.0.0.0:8080' }
+      let(:tcp_sockets) { [tcp_socket_address] }
+      before do
+        allow(unicorn).to receive(:listener_names).and_return(tcp_sockets)
+      end
+      it 'samples socket data' do
+        expect(raindrops).to receive(:tcp_listener_stats).with(tcp_sockets)
+        subject.sample
+      end
+      context 'stats collected' do
+        before do
+          allow(stats).to receive(:active).and_return('active')
+          allow(stats).to receive(:queued).and_return('queued')
+          allow(raindrops).to receive(:tcp_listener_stats).and_return({ tcp_socket_address => stats })
+        end
+        it 'updates metrics type unix and with addr' do
+          labels = { type: 'tcp', address: tcp_socket_address }
+          expect(subject).to receive_message_chain(:unicorn_active_connections, :set).with(labels, 'active')
+          expect(subject).to receive_message_chain(:unicorn_queued_connections, :set).with(labels, 'queued')
+          subject.sample
+        end
+      end
+    end
+  end
+  describe '#start' do
+    context 'when enabled' do
+      before do
+        allow(subject).to receive(:enabled?).and_return(true)
+      end
+      it 'creates new thread' do
+        expect(Thread).to receive(:new)
+        subject.start
+      end
+    end
+    context 'when disabled' do
+      before do
+        allow(subject).to receive(:enabled?).and_return(false)
+      end
+      it "doesn't create new thread" do
+        expect(Thread).not_to receive(:new)
+        subject.start
+      end
+    end
+  end
+end