Commit b9bbad52 authored by Jacob Vosmaer

Integrate mysql-to-postgres conversion

parent bf1e976d
require 'yaml'
require 'open3'
 
module Backup
class Database
@@ -17,7 +18,7 @@ module Backup
FileUtils.mkdir_p(@db_dir) unless Dir.exists?(@db_dir)
end
 
- def dump
+ def dump(mysql_to_postgresql=false)
FileUtils.rm_f(db_file_name)
compress_rd, compress_wr = IO.pipe
compress_pid = spawn(*%W(gzip -c), in: compress_rd, out: [db_file_name, 'w', 0600])
@@ -26,7 +27,9 @@ module Backup
dump_pid = case config["adapter"]
when /^mysql/ then
$progress.print "Dumping MySQL database #{config['database']} ... "
- spawn('mysqldump', *mysql_args, config['database'], *TABLES, out: compress_wr)
+ args = mysql_args
+ args << '--compatible=postgresql' if mysql_to_postgresql
+ spawn('mysqldump', *args, config['database'], *TABLES, out: compress_wr)
when "postgresql" then
$progress.print "Dumping PostgreSQL database #{config['database']} ... "
pg_env
@@ -38,6 +41,42 @@ module Backup
 
report_success(success)
abort 'Backup failed' unless success
convert_to_postgresql if mysql_to_postgresql
end
def convert_to_postgresql
mysql_dump_gz = db_file_name + '.mysql'
psql_dump_gz = db_file_name + '.psql'
drop_indexes_sql = File.join(db_dir, 'drop_indexes.sql')
File.rename(db_file_name, mysql_dump_gz)
$progress.print "Converting MySQL database dump to Postgres ... "
statuses = Open3.pipeline(
%W(gzip -cd #{mysql_dump_gz}),
%W(python lib/support/mysql-postgresql-converter/db_converter.py - - #{drop_indexes_sql}),
%W(gzip -c),
out: [psql_dump_gz, 'w', 0600]
)
if !statuses.compact.all?(&:success?)
abort "mysql-to-postgresql-converter failed"
end
$progress.puts '[DONE]'.green
$progress.print "Splicing in 'DROP INDEX' statements ... "
statuses = Open3.pipeline(
%W(lib/support/mysql-postgresql-converter/splice_drop_indexes #{psql_dump_gz} #{drop_indexes_sql}),
%W(gzip -c),
out: [db_file_name, 'w', 0600]
)
if !statuses.compact.all?(&:success?)
abort "Failed to splice in 'DROP INDEXES' statements"
end
$progress.puts '[DONE]'.green
ensure
FileUtils.rm_f([mysql_dump_gz, psql_dump_gz, drop_indexes_sql])
end
 
def restore
@@ -35,17 +35,14 @@ First, dump your MySQL database in PostgreSQL-compatible format
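The dump command itself appears above the part of the README shown in this hunk. A plausible sketch, assuming a database named databasename and the --compatible=postgresql flag that the backup code above passes to mysqldump (add whatever connection options your setup needs):

mysqldump --compatible=postgresql databasename > databasename.mysql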
 
Then, convert it using the db_converter.py script.
 
- python db_converter.py databasename.mysql databasename.psql
+ python db_converter.py databasename.mysql - drop_indexes.sql | gzip -c > databasename.unfinished.psql.gz
 
It'll print progress to the terminal
 
Now we have a DB dump that can be imported but the dump will be slow due
- to existing indexes. We use 'ed' to edit the DB dump file and move the
- 'DROP INDEX' statements to the start of the import. Ed is not the fastest
- tool for this job if your DB dump is multiple gigabytes. (Patches to
- the converter are welcome!)
+ to existing indexes.
 
- ed -s databasename.psql < move_drop_indexes.ed
+ ./splice_drop_indexes databasename.unfinished.psql.gz drop_indexes.sql > databasename.psql
 
Next, load your new dump into a fresh PostgreSQL database using:
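The exact command is cut off here; a typical invocation, assuming a freshly created database named databasename, would be something like:

psql -f databasename.psql databasename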
 
@@ -15,7 +15,7 @@ import time
import subprocess
 
 
- def parse(input_filename, output_filename):
+ def parse(input_filename, output_filename, drop_index_filename):
"Feed it a file, and it'll output a fixed one"
 
# State storage
@@ -44,6 +44,8 @@ def parse(input_filename, output_filename):
output = open(output_filename, "w")
logging = sys.stdout
 
drop_index = open(drop_index_filename, "w")
if input_filename == "-":
input_fh = sys.stdin
else:
@@ -234,12 +236,9 @@ def parse(input_filename, output_filename):
for line in sequence_lines:
output.write("%s;\n" % line)
 
- # This line is an anchor for move_drop_indexes.ed
- output.write("\n-- Drop indexes --\n")
+ drop_index.write("-- Drop indexes --\n")
for line in drop_index_lines:
- output.write("%s;\n" % line)
- # This line is an anchor for move_drop_indexes.ed
- output.write("-- END Drop indexes --\n")
+ drop_index.write("%s;\n" % line)
 
# Write indexes out
output.write("\n-- Indexes --\n")
@@ -253,4 +252,4 @@ def parse(input_filename, output_filename):
 
 
if __name__ == "__main__":
- parse(sys.argv[1], sys.argv[2])
+ parse(sys.argv[1], sys.argv[2], sys.argv[3])
#!/bin/sh
# This script reorders database dumps generated by db_converter.py for
# efficient consumption by Postgres.
fail() {
echo "$@" 1>2
exit 1
}
db_gz=$1
drop_indexes_sql=$2
if [ -z "$db_gz" ] || [ -z "$drop_indexes_sql" ] ; then
fail "Usage: $0 database.sql.gz drop_indexes.sql"
fi
# Capture all text up to the first occurrence of 'SET CONSTRAINTS'
preamble=$(zcat "$db_gz" | sed '/SET CONSTRAINTS/q')
if [ -z "$preamble" ] ; then
fail "Could not read preamble"
fi
drop_indexes=$(cat "$drop_indexes_sql")
if [ -z "$drop_indexes" ] ; then
fail "Could not read DROP INDEXES file"
fi
# Print preamble and drop indexes
cat <<EOF
${preamble}
${drop_indexes}
EOF
# Print the rest of database.sql.gz: the awk program sets flag 'f' when it
# sees 'SET CONSTRAINTS', so every line after the first match is printed.
zcat "$db_gz" | awk 'f; /SET CONSTRAINTS/ { f = 1 }'
@@ -3,13 +3,14 @@ namespace :backup do
desc "GITLAB | Create a backup of the GitLab CI database"
task create: :environment do
configure_cron_mode
mysql_to_postgresql = (ENV['MYSQL_TO_POSTGRESQL'] == '1')
 
$progress.puts "Applying final database migrations ... ".blue
Rake::Task['db:migrate'].invoke
$progress.puts "done".green
 
$progress.puts "Dumping database ... ".blue
- Backup::Database.new.dump
+ Backup::Database.new.dump(mysql_to_postgresql)
$progress.puts "done".green
 
$progress.puts "Dumping builds ... ".blue