Vendor mysql-postgresql-converter

Source: https://github.com/gitlabhq/mysql-postgresql-converter.git Revision: dda8bc904596577ce158cf78eedfa11be2c26109

Vendor mysql-postgresql-converter
bf1e976d · Jacob Vosmaer · da1fd14e · bf1e976d · bf1e976d · bf1e976d
Commit bf1e976d authored 9 years ago by Jacob Vosmaer
--- a/lib/support/mysql-postgresql-converter/LICENSE
+++ b/lib/support/mysql-postgresql-converter/LICENSE
+Copyright (c) 2012 Lanyrd Inc.
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+
+Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--- a/lib/support/mysql-postgresql-converter/README.md
+++ b/lib/support/mysql-postgresql-converter/README.md
+MySQL to PostgreSQL Converter
+=============================
+
+Lanyrd's MySQL to PostgreSQL conversion script. Use with care.
+
+This script was designed for our specific database and column requirements -
+notably, it doubles the lengths of VARCHARs due to a unicode size problem we
+had, places indexes on all foreign keys, and presumes you're using Django
+for column typing purposes.
+
+GitLab-specific changes
+-----------------------
+
+The `gitlab` branch of this fork contains the following changes made for
+GitLab.
+
+- Guard against replacing '0000-00-00 00:00:00' inside SQL text fields.
+- Replace all MySQL zero-byte string literals `\0`. This is safe as of GitLab
+  6.8 because the GitLab database schema contains no binary columns.
+- Never set 'NOT NULL' constraints on datetimes.
+- Drop sequences before creating them.
+- Preserve default values of boolean (originally `tinyint(1)`) columns.
+- Import all indexes.
+- Import index names.
+- Drop tables before creating.
+- Drop indexes before creating.
+
+How to use
+----------
+
+First, dump your MySQL database in PostgreSQL-compatible format
+
+    mysqldump --compatible=postgresql --default-character-set=utf8 \
+    -r databasename.mysql -u root gitlabhq_production -p
+
+Then, convert it using the dbconverter.py script.
+
+    python db_converter.py databasename.mysql databasename.psql
+
+It'll print progress to the terminal
+
+Now we have a DB dump that can be imported but the dump will be slow due
+to existing indexes. We use 'ed' to edit the DB dump file and move the
+'DROP INDEX' statements to the start of the import. Ed is not the fastest
+tool for this job if your DB dump is multiple gigabytes. (Patches to
+the converter are welcome!)
+
+    ed -s databasename.psql < move_drop_indexes.ed
+
+Next, load your new dump into a fresh PostgreSQL database using: 
+
+`psql -f databasename.psql -d gitlabhq_production`
+
+More information
+----------------
+
+You can learn more about the move which this powered at http://lanyrd.com/blog/2012/lanyrds-big-move/ and some technical details of it at http://www.aeracode.org/2012/11/13/one-change-not-enough/.
--- a/lib/support/mysql-postgresql-converter/db_converter.py
+++ b/lib/support/mysql-postgresql-converter/db_converter.py
+#!/usr/bin/env python
+
+"""
+Fixes a MySQL dump made with the right format so it can be directly
+imported to a new PostgreSQL database.
+
+Dump using:
+mysqldump --compatible=postgresql --default-character-set=utf8 -r databasename.mysql -u root databasename
+"""
+
+import re
+import sys
+import os
+import time
+import subprocess
+
+
+def parse(input_filename, output_filename):
+    "Feed it a file, and it'll output a fixed one"
+
+    # State storage
+    if input_filename == "-":
+        num_lines = -1
+    else:
+        num_lines = int(subprocess.check_output(["wc", "-l", input_filename]).strip().split()[0])
+    tables = {}
+    current_table = None
+    creation_lines = []
+    enum_types = []
+    foreign_key_lines = []
+    index_lines = []
+    drop_index_lines = []
+    sequence_lines = []
+    cast_lines = []
+    num_inserts = 0
+    started = time.time()
+
+    # Open output file and write header. Logging file handle will be stdout
+    # unless we're writing output to stdout, in which case NO PROGRESS FOR YOU.
+    if output_filename == "-":
+        output = sys.stdout
+        logging = open(os.devnull, "w")
+    else:
+        output = open(output_filename, "w")
+        logging = sys.stdout
+
+    if input_filename == "-":
+        input_fh = sys.stdin
+    else:
+        input_fh = open(input_filename)
+
+
+    output.write("-- Converted by db_converter\n")
+    output.write("START TRANSACTION;\n")
+    output.write("SET standard_conforming_strings=off;\n")
+    output.write("SET escape_string_warning=off;\n")
+    output.write("SET CONSTRAINTS ALL DEFERRED;\n\n")
+
+    for i, line in enumerate(input_fh):
+        time_taken = time.time() - started
+        percentage_done = (i+1) / float(num_lines)
+        secs_left = (time_taken / percentage_done) - time_taken
+        logging.write("\rLine %i (of %s: %.2f%%) [%s tables] [%s inserts] [ETA: %i min %i sec]" % (
+            i + 1,
+            num_lines,
+            ((i+1)/float(num_lines))*100,
+            len(tables),
+            num_inserts,
+            secs_left // 60,
+            secs_left % 60,
+        ))
+        logging.flush()
+        line = line.decode("utf8").strip().replace(r"\\", "WUBWUBREALSLASHWUB").replace(r"\0", "").replace(r"\'", "''").replace("WUBWUBREALSLASHWUB", r"\\")
+        # Ignore comment lines
+        if line.startswith("--") or line.startswith("/*") or line.startswith("LOCK TABLES") or line.startswith("DROP TABLE") or line.startswith("UNLOCK TABLES") or not line:
+            continue
+
+        # Outside of anything handling
+        if current_table is None:
+            # Start of a table creation statement?
+            if line.startswith("CREATE TABLE"):
+                current_table = line.split('"')[1]
+                tables[current_table] = {"columns": []}
+                creation_lines = []
+            # Inserting data into a table?
+            elif line.startswith("INSERT INTO"):
+                output.write(re.sub(r"([^'])'0000-00-00 00:00:00'", r"\1NULL", line.encode("utf8")) + "\n")
+                num_inserts += 1
+            # ???
+            else:
+                print "\n ! Unknown line in main body: %s" % line
+
+        # Inside-create-statement handling
+        else:
+            # Is it a column?
+            if line.startswith('"'):
+                useless, name, definition = line.strip(",").split('"',2)
+                try:
+                    type, extra = definition.strip().split(" ", 1)
+
+                    # This must be a tricky enum
+                    if ')' in extra:
+                        type, extra = definition.strip().split(")")
+
+                except ValueError:
+                    type = definition.strip()
+                    extra = ""
+                extra = re.sub("CHARACTER SET [\w\d]+\s*", "", extra.replace("unsigned", ""))
+                extra = re.sub("COLLATE [\w\d]+\s*", "", extra.replace("unsigned", ""))
+
+                # See if it needs type conversion
+                final_type = None
+                final_default = None
+                set_sequence = None
+                if type == "tinyint(1)":
+                    type = "int4"
+                    set_sequence = True
+                    final_type = "boolean"
+
+                    if "DEFAULT '0'" in extra:
+                        final_default = "FALSE"
+                    elif "DEFAULT '1'" in extra:
+                        final_default = "TRUE"
+
+                elif type.startswith("int("):
+                    type = "integer"
+                    set_sequence = True
+                elif type.startswith("bigint("):
+                    type = "bigint"
+                    set_sequence = True
+                elif type == "longtext":
+                    type = "text"
+                elif type == "mediumtext":
+                    type = "text"
+                elif type == "tinytext":
+                    type = "text"
+                elif type.startswith("varchar("):
+                    size = int(type.split("(")[1].rstrip(")"))
+                    type = "varchar(%s)" % (size * 2)
+                elif type.startswith("smallint("):
+                    type = "int2"
+                    set_sequence = True
+                elif type == "datetime":
+                    type = "timestamp with time zone"
+                    extra = extra.replace("NOT NULL", "")
+                elif type == "double":
+                    type = "double precision"
+                elif type == "blob":
+                    type = "bytea"
+                elif type.startswith("enum(") or type.startswith("set("):
+
+                    types_str = type.split("(")[1].rstrip(")").rstrip('"')
+                    types_arr = [type_str.strip('\'') for type_str in types_str.split(",")]
+
+                    # Considered using values to make a name, but its dodgy
+                    # enum_name = '_'.join(types_arr)
+                    enum_name = "{0}_{1}".format(current_table, name)
+
+                    if enum_name not in enum_types:
+                        output.write("CREATE TYPE {0} AS ENUM ({1}); \n".format(enum_name, types_str));
+                        enum_types.append(enum_name)
+
+                    type = enum_name
+
+                if final_type:
+                    cast_lines.append("ALTER TABLE \"%s\" ALTER COLUMN \"%s\" DROP DEFAULT" % (current_table, name))
+                    cast_lines.append("ALTER TABLE \"%s\" ALTER COLUMN \"%s\" TYPE %s USING CAST(\"%s\" as %s)" % (current_table, name, final_type, name, final_type))
+                    if final_default:
+                        cast_lines.append("ALTER TABLE \"%s\" ALTER COLUMN \"%s\" SET DEFAULT %s" % (current_table, name, final_default))
+                # ID fields need sequences [if they are integers?]
+                if name == "id" and set_sequence is True:
+                    sequence_lines.append("DROP SEQUENCE IF EXISTS %s_id_seq" % (current_table))
+                    sequence_lines.append("CREATE SEQUENCE %s_id_seq" % (current_table))
+                    sequence_lines.append("SELECT setval('%s_id_seq', max(id)) FROM %s" % (current_table, current_table))
+                    sequence_lines.append("ALTER TABLE \"%s\" ALTER COLUMN \"id\" SET DEFAULT nextval('%s_id_seq')" % (current_table, current_table))
+                # Record it
+                creation_lines.append('"%s" %s %s' % (name, type, extra))
+                tables[current_table]['columns'].append((name, type, extra))
+            # Is it a constraint or something?
+            elif line.startswith("PRIMARY KEY"):
+                creation_lines.append(line.rstrip(","))
+            elif line.startswith("CONSTRAINT"):
+                foreign_key_lines.append("ALTER TABLE \"%s\" ADD CONSTRAINT %s DEFERRABLE INITIALLY DEFERRED" % (current_table, line.split("CONSTRAINT")[1].strip().rstrip(",")))
+                foreign_key_lines.append("CREATE INDEX ON \"%s\" %s" % (current_table, line.split("FOREIGN KEY")[1].split("REFERENCES")[0].strip().rstrip(",")))
+            elif line.startswith("UNIQUE KEY \""):
+                index_name      = line.split('"')[1].split('"')[0]
+                index_columns   = line.split("(")[1].split(")")[0]
+                index_lines.append("CREATE UNIQUE INDEX \"%s\" ON %s (%s)" % (index_name, current_table, index_columns))
+                drop_index_lines.append("DROP INDEX IF EXISTS \"%s\"" % index_name)
+            elif line.startswith("UNIQUE KEY"):
+                index_columns   = line.split("(")[1].split(")")[0]
+                index_lines.append("CREATE UNIQUE INDEX ON %s (%s)" % (current_table, index_columns))
+            elif line.startswith("KEY \""):
+                index_name      = line.split('"')[1].split('"')[0]
+                index_columns   = line.split("(")[1].split(")")[0]
+                index_lines.append("CREATE INDEX \"%s\" ON %s (%s)" % (index_name, current_table, index_columns))
+                drop_index_lines.append("DROP INDEX IF EXISTS \"%s\"" % index_name)
+            elif line.startswith("KEY"):
+                index_columns = line.split("(")[1].split(")")[0]
+                index_lines.append("CREATE INDEX ON %s (%s)" % (current_table, index_columns))
+            elif line.startswith("FULLTEXT KEY"):
+                fulltext_keys = " || ' ' || ".join( line.split('(')[-1].split(')')[0].replace('"', '').split(',') )
+                index_lines.append("CREATE INDEX ON %s USING gin(to_tsvector('english', %s))" % (current_table, fulltext_keys))
+            # Is it the end of the table?
+            elif line == ");":
+                output.write("DROP TABLE IF EXISTS \"%s\";\n" % current_table)
+                output.write("CREATE TABLE \"%s\" (\n" % current_table)
+                for i, line in enumerate(creation_lines):
+                    output.write("    %s%s\n" % (line, "," if i != (len(creation_lines) - 1) else ""))
+                output.write(');\n\n')
+                current_table = None
+            # ???
+            else:
+                print "\n ! Unknown line inside table creation: %s" % line
+
+
+    # Finish file
+    output.write("\n-- Post-data save --\n")
+    output.write("COMMIT;\n")
+    output.write("START TRANSACTION;\n")
+
+    # Write typecasts out
+    output.write("\n-- Typecasts --\n")
+    for line in cast_lines:
+        output.write("%s;\n" % line)
+
+    # Write FK constraints out
+    output.write("\n-- Foreign keys --\n")
+    for line in foreign_key_lines:
+        output.write("%s;\n" % line)
+
+    # Write sequences out
+    output.write("\n-- Sequences --\n")
+    for line in sequence_lines:
+        output.write("%s;\n" % line)
+
+    # This line is an anchor for move_drop_indexes.ed
+    output.write("\n-- Drop indexes --\n")
+    for line in drop_index_lines:
+        output.write("%s;\n" % line)
+    # This line is an anchor for move_drop_indexes.ed
+    output.write("-- END Drop indexes --\n")
+
+    # Write indexes out
+    output.write("\n-- Indexes --\n")
+    for line in index_lines:
+        output.write("%s;\n" % line)
+
+    # Finish file
+    output.write("\n")
+    output.write("COMMIT;\n")
+    print ""
+
+
+if __name__ == "__main__":
+    parse(sys.argv[1], sys.argv[2])
--- a/lib/support/mysql-postgresql-converter/move_drop_indexes.ed
+++ b/lib/support/mysql-postgresql-converter/move_drop_indexes.ed
+# Look for the 'Drop indexes' line and mark it with 'a'
+/^-- Drop indexes --$/
+ka
+# Look for the 'END Drop indexes' line and mark it with 'b'
+/^-- END Drop indexes --$/
+kb
+# Move the 'Drop indexes'..'END Drop indexes' block after the line containing
+# 'SET CONSTRAINTS'.
+1
+/SET CONSTRAINTS/
+'a,'bm.
+wq