Skip to content
Snippets Groups Projects
Select Git revision
  • move-gl-dropdown
  • improve-table-pagination-spec
  • move-markdown-preview
  • winh-fix-merge-request-spec
  • master default
  • index-namespaces-lower-name
  • winh-single-karma-test
  • 10-3-stable
  • 36782-replace-team-user-role-with-add_role-user-in-specs
  • winh-modal-internal-state
  • tz-ide-file-icons
  • 38869-milestone-select
  • update-autodevops-template
  • jivl-activate-repo-cookie-preferences
  • qa-add-deploy-key
  • docs-move-article-ldap
  • 40780-choose-file
  • 22643-manual-job-page
  • refactor-cluster-show-page-conservative
  • dm-sidekiq-versioning
  • v10.4.0.pre
  • v10.3.0
  • v10.3.0-rc5
  • v10.3.0-rc4
  • v10.3.0-rc3
  • v10.3.0-rc2
  • v10.2.5
  • v10.3.0-rc1
  • v10.0.7
  • v10.1.5
  • v10.2.4
  • v10.2.3
  • v10.2.2
  • v10.2.1
  • v10.3.0.pre
  • v10.2.0
  • v10.2.0-rc4
  • v10.2.0-rc3
  • v10.1.4
  • v10.2.0-rc2
40 results

median.rb

Forked from GitLab.org / GitLab FOSS
5887 commits behind the upstream repository.
median.rb 4.39 KiB
# https://www.periscopedata.com/blog/medians-in-sql.html
module Gitlab
  module Database
    module Median
      def median_datetime(arel_table, query_so_far, column_sym)
        median_queries =
          if Gitlab::Database.postgresql?
            pg_median_datetime_sql(arel_table, query_so_far, column_sym)
          elsif Gitlab::Database.mysql?
            mysql_median_datetime_sql(arel_table, query_so_far, column_sym)
          end

        results = Array.wrap(median_queries).map do |query|
          ActiveRecord::Base.connection.execute(query)
        end
        extract_median(results).presence
      end

      def extract_median(results)
        result = results.compact.first

        if Gitlab::Database.postgresql?
          result = result.first.presence
          median = result['median'] if result
          median.to_f if median
        elsif Gitlab::Database.mysql?
          result.to_a.flatten.first
        end
      end

      def mysql_median_datetime_sql(arel_table, query_so_far, column_sym)
        query = arel_table
                .from(arel_table.project(Arel.sql('*')).order(arel_table[column_sym]).as(arel_table.table_name))
                .project(average([arel_table[column_sym]], 'median'))
                .where(
                  Arel::Nodes::Between.new(
                    Arel.sql("(select @row_id := @row_id + 1)"),
                    Arel::Nodes::And.new(
                      [Arel.sql('@ct/2.0'),
                       Arel.sql('@ct/2.0 + 1')]
                    )
                  )
                ).
                # Disallow negative values
                where(arel_table[column_sym].gteq(0))

        [
          Arel.sql("CREATE TEMPORARY TABLE IF NOT EXISTS #{query_so_far.to_sql}"),
          Arel.sql("set @ct := (select count(1) from #{arel_table.table_name});"),
          Arel.sql("set @row_id := 0;"),
          query.to_sql,
          Arel.sql("DROP TEMPORARY TABLE IF EXISTS #{arel_table.table_name};")
        ]
      end

      def pg_median_datetime_sql(arel_table, query_so_far, column_sym)
        # Create a CTE with the column we're operating on, row number (after sorting by the column
        # we're operating on), and count of the table we're operating on (duplicated across) all rows
        # of the CTE. For example, if we're looking to find the median of the `projects.star_count`
        # column, the CTE might look like this:
        #
        #  star_count | row_id | ct
        # ------------+--------+----
        #           5 |      1 |  3
        #           9 |      2 |  3
        #          15 |      3 |  3
        cte_table = Arel::Table.new("ordered_records")
        cte = Arel::Nodes::As.new(
          cte_table,
          arel_table
            .project(
              arel_table[column_sym].as(column_sym.to_s),
              Arel::Nodes::Over.new(Arel::Nodes::NamedFunction.new("row_number", []),
                                    Arel::Nodes::Window.new.order(arel_table[column_sym])).as('row_id'),
              arel_table.project("COUNT(1)").as('ct')).
            # Disallow negative values
            where(arel_table[column_sym].gteq(zero_interval)))

        # From the CTE, select either the middle row or the middle two rows (this is accomplished
        # by 'where cte.row_id between cte.ct / 2.0 AND cte.ct / 2.0 + 1'). Find the average of the
        # selected rows, and this is the median value.
        cte_table.project(average([extract_epoch(cte_table[column_sym])], "median"))
          .where(
            Arel::Nodes::Between.new(
              cte_table[:row_id],
              Arel::Nodes::And.new(
                [(cte_table[:ct] / Arel.sql('2.0')),
                 (cte_table[:ct] / Arel.sql('2.0') + 1)]
              )
            )
          )
          .with(query_so_far, cte)
          .to_sql
      end

      private

      def average(args, as)
        Arel::Nodes::NamedFunction.new("AVG", args, as)
      end

      def extract_epoch(arel_attribute)
        Arel.sql(%Q{EXTRACT(EPOCH FROM "#{arel_attribute.relation.name}"."#{arel_attribute.name}")})
      end

      def extract_diff_epoch(diff)
        return diff unless Gitlab::Database.postgresql?

        Arel.sql(%Q{EXTRACT(EPOCH FROM (#{diff.to_sql}))})
      end

      # Need to cast '0' to an INTERVAL before we can check if the interval is positive
      def zero_interval
        Arel::Nodes::NamedFunction.new("CAST", [Arel.sql("'0' AS INTERVAL")])
      end
    end
  end
end