Commit 9417aaa4 authored by Nick Thomas

Merge branch '1-initial-implementation' into 'master'

Initial implementation of an elasticsearch indexer in Go

Closes #1

See merge request !1
parents be84ab1b e9fd598f
/es-git-go
/.GOPATH
/bin
/tmp
/vendor/github.com/stretchr/testify
/vendor/github.com/wadey/gocovmerge
/vendor/golang.org/x/tools
/vendor/manifest
.test: &test
services:
- elasticsearch:5.1
variables:
V: "1"
ELASTIC_CONNECTION_INFO: '{"url":["http://elasticsearch:9200"]}'
stage: test
script:
- apt-get update && apt-get -yy install libicu-dev
- make setup
- make format
- make cover
- make
- make test
test 1.7:
<<: *test
image: golang:1.7
test 1.8:
<<: *test
image: golang:1.8
Makefile 0 → 100644
# The import path is where your repository can be found.
# To import subpackages, always prepend the full import path.
# If you change this, run `make clean`. Read more: https://git.io/vM7zV
IMPORT_PATH := gitlab.com/gitlab-org/es-git-go
GO15VENDOREXPERIMENT := 1
# V := 1 # When V is set, print commands and build progress.
# Space separated patterns of packages to skip in list, test, format.
IGNORED_PACKAGES := /vendor/
.PHONY: all
all: build
.PHONY: build
build: .GOPATH/.ok
$Q go install $(if $V,-v) $(VERSION_FLAGS) $(IMPORT_PATH)
### Code not in the repository root? Another binary? Add to the path like this.
# .PHONY: otherbin
# otherbin: .GOPATH/.ok
# $Q go install $(if $V,-v) $(VERSION_FLAGS) $(IMPORT_PATH)/cmd/otherbin
##### ^^^^^^ EDIT ABOVE ^^^^^^ #####
##### =====> Utility targets <===== #####
.PHONY: clean test list cover format
clean:
$Q rm -rf bin .GOPATH tmp
test: .GOPATH/.ok
$Q go test $(if $V,-v) -i -race $(allpackages) # install -race libs to speed up next run
ifndef CI
$Q go vet $(allpackages)
$Q GODEBUG=cgocheck=2 go test $(if $V,-v) -race $(allpackages)
else
$Q ( go vet $(allpackages); echo $$? ) | \
tee .GOPATH/test/vet.txt | sed '$$ d'; exit $$(tail -1 .GOPATH/test/vet.txt)
$Q ( GODEBUG=cgocheck=2 go test -v -race $(allpackages); echo $$? ) | \
tee .GOPATH/test/output.txt | sed '$$ d'; exit $$(tail -1 .GOPATH/test/output.txt)
endif
list: .GOPATH/.ok
@echo $(allpackages)
cover: bin/gocovmerge .GOPATH/.ok
@echo "NOTE: make cover does not exit 1 on failure, don't use it to check for test success!"
$Q rm -f .GOPATH/cover/*.out .GOPATH/cover/all.merged
$(if $V,@echo "-- go test -coverpkg=./... -coverprofile=.GOPATH/cover/... ./...")
@for MOD in $(allpackages); do \
go test -coverpkg=`echo $(allpackages)|tr " " ","` \
-coverprofile=.GOPATH/cover/unit-`echo $$MOD|tr "/" "_"`.out \
$$MOD 2>&1 | grep -v "no packages being tested depend on"; \
done
$Q ./bin/gocovmerge .GOPATH/cover/*.out > .GOPATH/cover/all.merged
ifndef CI
$Q go tool cover -html .GOPATH/cover/all.merged
else
$Q go tool cover -html .GOPATH/cover/all.merged -o .GOPATH/cover/all.html
endif
@echo ""
@echo "=====> Total test coverage: <====="
@echo ""
$Q go tool cover -func .GOPATH/cover/all.merged
format: bin/goimports .GOPATH/.ok
$Q find .GOPATH/src/$(IMPORT_PATH)/ -iname \*.go | grep -v \
-e "^$$" $(addprefix -e ,$(IGNORED_PACKAGES)) | xargs ./bin/goimports -w
##### =====> Internals <===== #####
.PHONY: setup
setup: clean .GOPATH/.ok
@if ! grep "/.GOPATH" .gitignore > /dev/null 2>&1; then \
echo "/.GOPATH" >> .gitignore; \
echo "/bin" >> .gitignore; \
fi
go get -u github.com/FiloSottile/gvt
- ./bin/gvt fetch golang.org/x/tools/cmd/goimports
- ./bin/gvt fetch github.com/wadey/gocovmerge
- ./bin/gvt fetch github.com/stretchr/testify/assert
- mkdir tmp
- git clone --bare https://gitlab.com/gitlab-org/gitlab-test.git tmp/gitlab-test.git
VERSION := $(shell git describe --tags --always --dirty="-dev")
DATE := $(shell date -u '+%Y-%m-%d-%H%M UTC')
VERSION_FLAGS := -ldflags='-X "main.Version=$(VERSION)" -X "main.BuildTime=$(DATE)"'
# cd into the GOPATH to work around ./... not following symlinks
_allpackages = $(shell ( cd $(CURDIR)/.GOPATH/src/$(IMPORT_PATH) && \
GOPATH=$(CURDIR)/.GOPATH go list ./... 2>&1 1>&3 | \
grep -v -e "^$$" $(addprefix -e ,$(IGNORED_PACKAGES)) 1>&2 ) 3>&1 | \
grep -v -e "^$$" $(addprefix -e ,$(IGNORED_PACKAGES)))
# memoize allpackages, so that it's executed only once and only if used
allpackages = $(if $(__allpackages),,$(eval __allpackages := $$(_allpackages)))$(__allpackages)
export GOPATH := $(CURDIR)/.GOPATH
unexport GOBIN
Q := $(if $V,,@)
.GOPATH/.ok:
$Q mkdir -p "$(dir .GOPATH/src/$(IMPORT_PATH))"
$Q ln -s ../../../.. ".GOPATH/src/$(IMPORT_PATH)"
$Q mkdir -p .GOPATH/test .GOPATH/cover
$Q mkdir -p bin
$Q ln -s ../bin .GOPATH/bin
$Q touch $@
.PHONY: bin/gocovmerge bin/goimports
bin/gocovmerge: .GOPATH/.ok
@test -d ./vendor/github.com/wadey/gocovmerge || \
{ echo "Vendored gocovmerge not found, try running 'make setup'..."; exit 1; }
$Q go install $(IMPORT_PATH)/vendor/github.com/wadey/gocovmerge
bin/goimports: .GOPATH/.ok
@test -d ./vendor/golang.org/x/tools/cmd/goimports || \
{ echo "Vendored goimports not found, try running 'make setup'..."; exit 1; }
$Q go install $(IMPORT_PATH)/vendor/golang.org/x/tools/cmd/goimports
# Based on https://github.com/cloudflare/hellogopher - v1.1 - MIT License
#
# Copyright (c) 2017 Cloudflare
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
# GitLab Elasticsearch Indexer
This project indexes Git repositories into Elasticsearch for GitLab. See the
[homepage](https://gitlab.com/gitlab-org/es-git-go) for more information.
## Building
This project relies on [ICU](http://site.icu-project.org/) for text encoding;
ensure the development packages for your platform are installed before running
`make`:
### Debian / Ubuntu
```
# apt install libicu-dev
```
### macOS
```
$ brew install icu4c
$ export PKG_CONFIG_PATH="/usr/local/opt/icu4c/lib/pkgconfig:$PKG_CONFIG_PATH"
```
package elastic
import (
"context"
"fmt"
"net/http"
"os"
"strings"
"github.com/aws/aws-sdk-go/aws/credentials"
"github.com/aws/aws-sdk-go/aws/signer/v4"
"github.com/deoxxa/aws_signing_client"
"gopkg.in/olivere/elastic.v5"
)
var (
timeoutError = fmt.Errorf("Timeout")
)
const (
// TODO: make this configurable / detectable.
// Limiting to 10MiB lets us work on small AWS clusters, but unnecessarily
// increases round trips in larger or non-AWS clusters
MaxBulkSize = 10 * 1024 * 1024
BulkWorkers = 10
)
type Client struct {
IndexName string
ProjectID string
Client *elastic.Client
bulk *elastic.BulkProcessor
}
// FromEnv creates an Elasticsearch client from the `ELASTIC_CONNECTION_INFO`
// environment variable
func FromEnv(projectID string) (*Client, error) {
data := strings.NewReader(os.Getenv("ELASTIC_CONNECTION_INFO"))
config, err := ReadConfig(data)
if err != nil {
return nil, fmt.Errorf("Couldn't parse ELASTIC_CONNECTION_INFO: %s", err)
}
railsEnv := os.Getenv("RAILS_ENV")
indexName := "gitlab"
if railsEnv != "" {
indexName = indexName + "-" + railsEnv
}
config.IndexName = indexName
config.ProjectID = projectID
return NewClient(config)
}
func NewClient(config *Config) (*Client, error) {
var opts []elastic.ClientOptionFunc
// AWS settings have to come first or they override custom URL, etc
if config.AWS {
credentials := credentials.NewStaticCredentials(config.AccessKey, config.SecretKey, "")
signer := v4.NewSigner(credentials)
awsClient, err := aws_signing_client.New(signer, &http.Client{}, "es", config.Region)
if err != nil {
return nil, err
}
opts = append(opts, elastic.SetHttpClient(awsClient))
}
// Sniffer should look for HTTPS URLs if at least one initial URL is HTTPS
for _, url := range config.URL {
if strings.HasPrefix(url, "https:") {
opts = append(opts, elastic.SetScheme("https"))
break
}
}
opts = append(opts, elastic.SetURL(config.URL...), elastic.SetSniff(false))
client, err := elastic.NewClient(opts...)
if err != nil {
return nil, err
}
bulk, err := client.BulkProcessor().
Workers(BulkWorkers).
BulkSize(MaxBulkSize).
Do(context.Background())
if err != nil {
return nil, err
}
return &Client{
IndexName: config.IndexName,
ProjectID: config.ProjectID,
Client: client,
bulk: bulk,
}, nil
}
func (c *Client) ParentID() string {
return c.ProjectID
}
func (c *Client) Flush() error {
return c.bulk.Flush()
}
func (c *Client) Close() {
c.Client.Stop()
}
func (c *Client) Index(id string, thing interface{}) {
req := elastic.NewBulkIndexRequest().
Index(c.IndexName).
Type("repository").
Parent(c.ProjectID).
Id(id).
Doc(thing)
c.bulk.Add(req)
}
// We only really use this for tests
func (c *Client) Get(id string) (*elastic.GetResult, error) {
return c.Client.Get().
Index(c.IndexName).
Type("repository").
Id(id).
Routing(c.ProjectID).
Do(context.TODO())
}
func (c *Client) GetCommit(id string) (*elastic.GetResult, error) {
return c.Get(c.ProjectID + "_" + id)
}
func (c *Client) GetBlob(path string) (*elastic.GetResult, error) {
return c.Get(c.ProjectID + "_" + path)
}
func (c *Client) Remove(id string) {
req := elastic.NewBulkDeleteRequest().
Index(c.IndexName).
Type("repository").
Parent(c.ProjectID).
Id(id)
c.bulk.Add(req)
}
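
A minimal usage sketch of this client, assuming `ELASTIC_CONNECTION_INFO` points at a reachable cluster; the project ID and document here are illustrative only:

```
package main

import (
	"log"

	"gitlab.com/gitlab-org/es-git-go/elastic"
)

func main() {
	// FromEnv reads ELASTIC_CONNECTION_INFO (and RAILS_ENV) as described above.
	client, err := elastic.FromEnv("667")
	if err != nil {
		log.Fatal(err)
	}
	defer client.Close()

	// Index queues a document on the bulk processor; Flush forces it out.
	client.Index("667_README.md", map[string]interface{}{
		"blob": map[string]string{"path": "README.md"},
	})
	if err := client.Flush(); err != nil {
		log.Fatal(err)
	}
}
```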
package elastic_test
import (
"crypto/tls"
"fmt"
"net/http"
"net/http/httptest"
"os"
"regexp"
"strings"
"testing"
"time"
"github.com/stretchr/testify/assert"
"gitlab.com/gitlab-org/es-git-go/elastic"
)
const (
projectID = "667"
)
func TestAWSConfiguration(t *testing.T) {
var req *http.Request
// httptest certificate is unsigned
transport := http.DefaultTransport
defer func() { http.DefaultTransport = transport }()
http.DefaultTransport = &http.Transport{TLSClientConfig: &tls.Config{InsecureSkipVerify: true}}
f := func(w http.ResponseWriter, r *http.Request) {
req = r
w.Header().Set("Content-Type", "application/json")
w.Write([]byte(`{}`))
}
srv := httptest.NewTLSServer(http.HandlerFunc(f))
defer srv.Close()
config, err := elastic.ReadConfig(strings.NewReader(
`{
"url":["` + srv.URL + `"],
"aws":true,
"aws_region": "us-east-1",
"aws_access_key": "0",
"aws_secret_access_key": "0"
}`,
))
assert.NoError(t, err)
client, err := elastic.NewClient(config)
assert.NoError(t, err)
defer client.Close()
if assert.NotNil(t, req) {
authRE := regexp.MustCompile(`\AAWS4-HMAC-SHA256 Credential=0/\d{8}/us-east-1/es/aws4_request, SignedHeaders=accept;date;host;x-amz-date, Signature=[a-f0-9]{64}\z`)
assert.Regexp(t, authRE, req.Header.Get("Authorization"))
assert.NotEqual(t, "", req.Header.Get("X-Amz-Date"))
}
}
func TestElasticClientIndexAndRetrieval(t *testing.T) {
config := os.Getenv("ELASTIC_CONNECTION_INFO")
if config == "" {
t.Log("ELASTIC_CONNECTION_INFO not set")
t.SkipNow()
}
os.Setenv("RAILS_ENV", fmt.Sprintf("test-elastic-%d", time.Now().Unix()))
client, err := elastic.FromEnv(projectID)
assert.NoError(t, err)
assert.Equal(t, projectID, client.ParentID())
assert.NoError(t, client.CreateIndex())
blobDoc := map[string]interface{}{}
client.Index(projectID+"_foo", blobDoc)
commitDoc := map[string]interface{}{}
client.Index(projectID+"_0000", commitDoc)
assert.NoError(t, client.Flush())
blob, err := client.GetBlob("foo")
assert.NoError(t, err)
assert.Equal(t, true, blob.Found)
commit, err := client.GetCommit("0000")
assert.NoError(t, err)
assert.Equal(t, true, commit.Found)
client.Remove(projectID + "_foo")
assert.NoError(t, client.Flush())
_, err = client.GetBlob("foo")
assert.Error(t, err)
assert.NoError(t, client.DeleteIndex())
}
package elastic
import (
"encoding/json"
"io"
)
type Config struct {
IndexName string `json:"-"`
ProjectID string `json:"-"`
URL []string `json:"url"`
AWS bool `json:"aws"`
Region string `json:"aws_region"`
AccessKey string `json:"aws_access_key"`
SecretKey string `json:"aws_secret_access_key"`
}
func ReadConfig(r io.Reader) (*Config, error) {
var out Config
if err := json.NewDecoder(r).Decode(&out); err != nil {
return nil, err
}
return &out, nil
}
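
As a sketch of the connection-info document ReadConfig accepts (this mirrors the `ELASTIC_CONNECTION_INFO` value used in the CI configuration; the URL is illustrative):

```
package main

import (
	"fmt"
	"strings"

	"gitlab.com/gitlab-org/es-git-go/elastic"
)

func main() {
	config, err := elastic.ReadConfig(strings.NewReader(
		`{"url": ["http://localhost:9200"], "aws": false}`,
	))
	if err != nil {
		panic(err)
	}
	fmt.Println(config.URL) // [http://localhost:9200]
}
```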
package elastic
import (
"context"
)
var indexMapping = `
{
"settings": {
"analysis": {
"filter": {
"my_stemmer": {
"name": "light_english",
"type": "stemmer"
},
"code": {
"type": "pattern_capture",
"preserve_original": "1",
"patterns": [
"(\\p{Ll}+|\\p{Lu}\\p{Ll}+|\\p{Lu}+)",
"(\\d+)"
]
}
},
"char_filter": {
"code_mapping": {
"type": "mapping",
"mappings": [
". => ' '"
]
}
},
"analyzer": {
"default": {
"filter": [
"standard",
"lowercase",
"my_stemmer"
],
"tokenizer": "standard"
},
"code_search_analyzer": {
"filter": [
"lowercase",
"asciifolding"
],
"char_filter": [
"code_mapping"
],
"type": "custom",
"tokenizer": "standard"
},
"path_analyzer": {
"filter": [
"lowercase",
"asciifolding"
],
"type": "custom",
"tokenizer": "path_tokenizer"
},
"sha_analyzer": {
"filter": [
"lowercase",
"asciifolding"
],
"type": "custom",
"tokenizer": "sha_tokenizer"
},
"code_analyzer": {
"filter": [
"code",
"lowercase",
"asciifolding"
],
"char_filter": [
"code_mapping"
],
"type": "custom",
"tokenizer": "standard"
},
"my_ngram_analyzer": {
"filter": [
"lowercase"
],
"tokenizer": "my_ngram_tokenizer"
}
},
"tokenizer": {
"my_ngram_tokenizer": {
"token_chars": [
"letter",
"digit"
],
"min_gram": "2",
"type": "nGram",
"max_gram": "3"
},
"sha_tokenizer": {
"token_chars": [
"letter",
"digit"
],
"min_gram": "5",
"type": "edgeNGram",
"max_gram": "40"
},
"path_tokenizer": {
"reverse": "true",
"type": "path_hierarchy"
}
}
}
},
"mappings": {
"milestone": {
"_parent": {
"type": "project"
},
"_routing": {
"required": true
},
"properties": {
"created_at": {
"type": "date"
},
"description": {
"type": "text",
"index_options": "offsets"
},
"id": {
"type": "integer"
},
"project_id": {
"type": "integer"
},
"title": {
"type": "text",
"index_options": "offsets"
},
"updated_at": {
"type": "date"
}
}
},
"note": {
"_parent": {
"type": "project"
},
"_routing": {
"required": true
},
"properties": {
"created_at": {
"type": "date"
},
"id": {
"type": "integer"
},
"issue": {
"properties": {
"assignee_id": {
"type": "integer"
},
"author_id": {
"type": "integer"
},
"confidential": {
"type": "boolean"
}
}
},
"note": {
"type": "text",
"index_options": "offsets"
},
"noteable_id": {
"type": "integer"
},
"noteable_type": {
"type": "keyword"
},
"project_id": {
"type": "integer"
},
"updated_at": {
"type": "date"
}
}
},
"project_wiki": {
"_parent": {
"type": "project"
},
"_routing": {
"required": true
},
"properties": {
"blob": {
"properties": {
"commit_sha": {
"type": "text",
"index_options": "offsets",
"analyzer": "sha_analyzer"
},
"content": {
"type": "text",
"index_options": "offsets",
"analyzer": "code_analyzer",
"search_analyzer": "code_search_analyzer"
},
"file_name": {
"type": "text",
"analyzer": "code_analyzer",
"search_analyzer": "code_search_analyzer"
},
"id": {
"type": "text",
"index_options": "offsets",
"analyzer": "sha_analyzer"
},
"language": {
"type": "keyword"
},
"oid": {
"type": "text",
"index_options": "offsets",
"analyzer": "sha_analyzer"
},
"path": {
"type": "text",
"analyzer": "path_analyzer"
},
"rid": {
"type": "keyword"
}
}
},
"commit": {
"properties": {
"author": {
"properties": {
"email": {
"type": "text",
"index_options": "offsets"
},
"name": {
"type": "text",
"index_options": "offsets"
},
"time": {
"type": "date",
"format": "basic_date_time_no_millis"
}
}
},
"commiter": {
"properties": {
"email": {
"type": "text",
"index_options": "offsets"
},
"name": {
"type": "text",
"index_options": "offsets"
},
"time": {
"type": "date",
"format": "basic_date_time_no_millis"
}
}
},
"id": {
"type": "text",
"index_options": "offsets",
"analyzer": "sha_analyzer"
},
"message": {
"type": "text",
"index_options": "offsets"
},
"rid": {
"type": "keyword"
},
"sha": {
"type": "text",
"index_options": "offsets",
"analyzer": "sha_analyzer"
}
}
}
}
},
"issue": {
"_parent": {
"type": "project"
},
"_routing": {
"required": true
},
"properties": {
"assignee_id": {
"type": "integer"
},
"author_id": {
"type": "integer"
},
"confidential": {
"type": "boolean"
},
"created_at": {
"type": "date"
},
"description": {
"type": "text",
"index_options": "offsets"
},
"id": {
"type": "integer"
},
"iid": {
"type": "integer"
},
"project_id": {
"type": "integer"
},
"state": {
"type": "text"
},
"title": {
"type": "text",
"index_options": "offsets"
},
"updated_at": {
"type": "date"
}
}
},
"merge_request": {
"_parent": {
"type": "project"
},
"_routing": {
"required": true
},
"properties": {
"author_id": {
"type": "integer"
},
"created_at": {
"type": "date"
},
"description": {
"type": "text",
"index_options": "offsets"
},
"id": {
"type": "integer"
},
"iid": {
"type": "integer"
},
"merge_status": {
"type": "text"
},
"source_branch": {
"type": "text",
"index_options": "offsets"
},
"source_project_id": {
"type": "integer"
},
"state": {
"type": "text"
},
"target_branch": {
"type": "text",
"index_options": "offsets"
},
"target_project_id": {
"type": "integer"
},
"title": {
"type": "text",
"index_options": "offsets"
},
"updated_at": {
"type": "date"
}
}
},
"repository": {
"_parent": {
"type": "project"
},
"_routing": {
"required": true
},
"properties": {
"blob": {
"properties": {
"commit_sha": {
"type": "text",
"index_options": "offsets",
"analyzer": "sha_analyzer"
},
"content": {
"type": "text",
"index_options": "offsets",
"analyzer": "code_analyzer",
"search_analyzer": "code_search_analyzer"
},
"file_name": {
"type": "text",
"analyzer": "code_analyzer",
"search_analyzer": "code_search_analyzer"
},
"id": {
"type": "text",
"index_options": "offsets",
"analyzer": "sha_analyzer"
},
"language": {
"type": "keyword"
},
"oid": {
"type": "text",
"index_options": "offsets",
"analyzer": "sha_analyzer"
},
"path": {
"type": "text",
"analyzer": "path_analyzer"
},
"rid": {
"type": "keyword"
},
"type": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
}
},
"commit": {
"properties": {
"author": {
"properties": {
"email": {
"type": "text",
"index_options": "offsets"
},
"name": {
"type": "text",
"index_options": "offsets"
},
"time": {
"type": "date",
"format": "basic_date_time_no_millis"
}
}
},
"commiter": {
"properties": {
"email": {
"type": "text",
"index_options": "offsets"
},
"name": {
"type": "text",
"index_options": "offsets"
},
"time": {
"type": "date",
"format": "basic_date_time_no_millis"
}
}
},
"committer": {
"properties": {
"email": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"name": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"time": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
}
},
"id": {
"type": "text",
"index_options": "offsets",
"analyzer": "sha_analyzer"
},
"message": {
"type": "text",
"index_options": "offsets"
},
"rid": {
"type": "keyword"
},
"sha": {
"type": "text",
"index_options": "offsets",
"analyzer": "sha_analyzer"
},
"type": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
}
}
}
},
"project": {
"properties": {
"archived": {
"type": "boolean"
},
"created_at": {
"type": "date"
},
"description": {
"type": "text",
"index_options": "offsets"
},
"id": {
"type": "integer"
},
"issues_access_level": {
"type": "integer"
},
"last_activity_at": {
"type": "date"
},
"last_pushed_at": {
"type": "date"
},
"merge_requests_access_level": {
"type": "integer"
},
"name": {
"type": "text",
"index_options": "offsets"
},
"name_with_namespace": {
"type": "text",
"index_options": "offsets",
"analyzer": "my_ngram_analyzer"
},
"namespace_id": {
"type": "integer"
},
"path": {
"type": "text",
"index_options": "offsets"
},
"path_with_namespace": {
"type": "text",
"index_options": "offsets"
},
"repository_access_level": {
"type": "integer"
},
"snippets_access_level": {
"type": "integer"
},
"updated_at": {
"type": "date"
},
"visibility_level": {
"type": "integer"
},
"wiki_access_level": {
"type": "integer"
}
}
},
"snippet": {
"properties": {
"author_id": {
"type": "integer"
},
"content": {
"type": "text",
"index_options": "offsets"
},
"created_at": {
"type": "date"
},
"file_name": {
"type": "text",
"index_options": "offsets"
},
"id": {
"type": "integer"
},
"project_id": {
"type": "integer"
},
"state": {
"type": "text"
},
"title": {
"type": "text",
"index_options": "offsets"
},
"updated_at": {
"type": "date"
},
"visibility_level": {
"type": "integer"
}
}
}
}
}
`
// CreateIndex creates an index matching that created by gitlab-elasticsearch-git v1.1.1
func (c *Client) CreateIndex() error {
createIndex, err := c.Client.CreateIndex(c.IndexName).BodyString(indexMapping).Do(context.Background())
if err != nil {
return err
}
if !createIndex.Acknowledged {
return timeoutError
}
return nil
}
func (c *Client) DeleteIndex() error {
deleteIndex, err := c.Client.DeleteIndex(c.IndexName).Do(context.Background())
if err != nil {
return err
}
if !deleteIndex.Acknowledged {
return timeoutError
}
return nil
}
package git
import (
"fmt"
"gopkg.in/src-d/go-git.v4"
"gopkg.in/src-d/go-git.v4/plumbing"
"gopkg.in/src-d/go-git.v4/plumbing/filemode"
"gopkg.in/src-d/go-git.v4/plumbing/object"
"gopkg.in/src-d/go-git.v4/utils/merkletrie"
)
var (
endError = fmt.Errorf("Finished") // not really an error
)
type goGitRepository struct {
*git.Repository
FromHash plumbing.Hash
ToHash plumbing.Hash
FromCommit *object.Commit
ToCommit *object.Commit
}
func NewGoGitRepository(projectPath string, fromSHA string, toSHA string) (*goGitRepository, error) {
out := &goGitRepository{}
repo, err := git.PlainOpen(projectPath)
if err != nil {
return nil, err
}
out.Repository = repo
if fromSHA == "" {
out.FromHash = plumbing.ZeroHash
} else {
out.FromHash = plumbing.NewHash(fromSHA)
commit, err := repo.CommitObject(out.FromHash)
if err != nil {
return nil, fmt.Errorf("Bad from SHA (%s): %s", out.FromHash, err)
}
out.FromCommit = commit
}
if toSHA == "" {
ref, err := out.Repository.Head()
if err != nil {
return nil, err
}
out.ToHash = ref.Hash()
} else {
out.ToHash = plumbing.NewHash(toSHA)
}
commit, err := out.Repository.CommitObject(out.ToHash)
if err != nil {
return nil, fmt.Errorf("Bad to SHA (%s): %s", out.ToHash, err)
}
out.ToCommit = commit
return out, nil
}
func (r *goGitRepository) diff() (object.Changes, error) {
var fromTree, toTree *object.Tree
if r.FromCommit != nil {
tree, err := r.FromCommit.Tree()
if err != nil {
return nil, err
}
fromTree = tree
}
toTree, err := r.ToCommit.Tree()
if err != nil {
return nil, err
}
return object.DiffTree(fromTree, toTree)
}
func goGitBuildSignature(sig object.Signature) Signature {
return Signature{
Name: sig.Name,
Email: sig.Email,
When: sig.When,
}
}
func goGitBuildFile(change object.ChangeEntry, file *object.File) *File {
return &File{
Path: change.Name,
Oid: file.ID().String(),
Blob: file.Blob.Reader,
Size: file.Size,
}
}
func (r *goGitRepository) EachFileChange(ins, mod, del FileFunc) error {
changes, err := r.diff()
if err != nil {
return err
}
fromCommitStr := r.FromHash.String()
toCommitStr := r.ToHash.String()
for _, change := range changes {
// FIXME(nick): submodules may need better support
// https://github.com/src-d/go-git/issues/317
if change.From.TreeEntry.Mode == filemode.Submodule || change.To.TreeEntry.Mode == filemode.Submodule {
continue
}
fromF, toF, err := change.Files()
if err != nil {
return err
}
action, err := change.Action()
if err != nil {
return err
}
switch action {
case merkletrie.Insert:
err = ins(goGitBuildFile(change.To, toF), fromCommitStr, toCommitStr)
case merkletrie.Modify:
err = mod(goGitBuildFile(change.To, toF), fromCommitStr, toCommitStr)
case merkletrie.Delete:
err = del(goGitBuildFile(change.From, fromF), fromCommitStr, toCommitStr)
default:
err = fmt.Errorf("Unrecognised change calculating diff: %+v", change)
}
if err != nil {
return err
}
}
return nil
}
// EachCommit runs `f` for each commit in the range `fromSHA`..`toSHA`.
// go-git doesn't directly support ranges of revisions, so we emulate this by
// walking the commit history from `toSHA` back towards `fromSHA`.
func (r *goGitRepository) EachCommit(f CommitFunc) error {
err := object.WalkCommitHistoryPost(r.ToCommit, func(c *object.Commit) error {
if r.FromCommit != nil && c.ID() == r.FromCommit.ID() {
return endError
}
commit := &Commit{
Message: c.Message,
Hash: c.Hash.String(),
Author: goGitBuildSignature(c.Author),
Committer: goGitBuildSignature(c.Committer),
}
if err := f(commit); err != nil {
return err
}
return nil
})
if err != nil && err != endError {
return fmt.Errorf("WalkCommitHistory: %s", err)
}
return nil
}
package git
import (
"io"
"time"
)
type File struct {
Path string
Blob func() (io.ReadCloser, error)
Oid string
Size int64
}
type Signature struct {
Name string
Email string
When time.Time
}
type Commit struct {
Author Signature
Committer Signature
Message string
Hash string
}
type Repository interface {
EachFileChange(ins, mod, del FileFunc) error
EachCommit(f CommitFunc) error
}
type FileFunc func(file *File, fromCommit, toCommit string) error
type CommitFunc func(commit *Commit) error
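
To show how these types fit together, here is a short sketch that walks a repository with the go-git implementation above and prints each commit; the repository path is illustrative:

```
package main

import (
	"fmt"
	"log"

	"gitlab.com/gitlab-org/es-git-go/git"
)

func main() {
	// An empty fromSHA means the whole history; an empty toSHA defaults to HEAD.
	repo, err := git.NewGoGitRepository("/tmp/gitlab-test.git", "", "")
	if err != nil {
		log.Fatal(err)
	}

	err = repo.EachCommit(func(c *git.Commit) error {
		fmt.Println(c.Hash, c.Author.Name)
		return nil
	})
	if err != nil {
		log.Fatal(err)
	}
}
```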
package git_test
import (
"flag"
"io/ioutil"
"os"
"sort"
"testing"
"time"
"github.com/stretchr/testify/assert"
"gitlab.com/gitlab-org/es-git-go/git"
)
var (
testRepo = flag.String("test-repo", "../tmp/gitlab-test.git", "Path to `gitlab-test` repository for integration tests")
)
const (
initialSHA = "1a0b36b3cdad1d2ee32457c102a8c0b7056fa863"
headSHA = "b83d6e391c22777fca1ed3012fce84f633d7fed0"
)
func checkDeps(t *testing.T) {
if _, err := os.Stat(*testRepo); err != nil {
t.Log("No test repo found at ", *testRepo)
t.SkipNow()
}
}
func runEachCommit(repo git.Repository) (map[string]*git.Commit, []string, error) {
commits := make(map[string]*git.Commit)
commitHashes := []string{}
err := repo.EachCommit(func(commit *git.Commit) error {
commits[commit.Hash] = commit
commitHashes = append(commitHashes, commit.Hash)
return nil
})
return commits, commitHashes, err
}
func TestEachCommit(t *testing.T) {
checkDeps(t)
repo, err := git.NewGoGitRepository(*testRepo, "", headSHA)
assert.NoError(t, err)
commits, commitHashes, err := runEachCommit(repo)
assert.NoError(t, err)
expectedCommits := []string{
"b83d6e391c22777fca1ed3012fce84f633d7fed0",
"498214de67004b1da3d820901307bed2a68a8ef6",
"1b12f15a11fc6e62177bef08f47bc7b5ce50b141",
"38008cb17ce1466d8fec2dfa6f6ab8dcfe5cf49e",
"6907208d755b60ebeacb2e9dfea74c92c3449a1f",
"c347ca2e140aa667b968e51ed0ffe055501fe4f4",
"d59c60028b053793cecfb4022de34602e1a9218e",
"281d3a76f31c812dbf48abce82ccf6860adedd81",
"a5391128b0ef5d21df5dd23d98557f4ef12fae20",
"54fcc214b94e78d7a41a9a8fe6d87a5e59500e51",
"be93687618e4b132087f430a4d8fc3a609c9b77c",
"048721d90c449b244b7b4c53a9186b04330174ec",
"5f923865dde3436854e9ceb9cdb7815618d4e849",
"d2d430676773caa88cdaf7c55944073b2fd5561a",
"2ea1f3dec713d940208fb5ce4a38765ecb5d3f73",
"59e29889be61e6e0e5e223bfa9ac2721d31605b8",
"66eceea0db202bb39c4e445e8ca28689645366c5",
"08f22f255f082689c0d7d39d19205085311542bc",
"19e2e9b4ef76b422ce1154af39a91323ccc57434",
"c642fe9b8b9f28f9225d7ea953fe14e74748d53b",
"9a944d90955aaf45f6d0c88f30e27f8d2c41cec0",
"c7fbe50c7c7419d9701eebe64b1fdacc3df5b9dd",
"e56497bb5f03a90a51293fc6d516788730953899",
"4cd80ccab63c82b4bad16faa5193fbd2aa06df40",
"5937ac0a7beb003549fc5fd26fc247adbce4a52e",
"570e7b2abdd848b95f2f578043fc23bd6f6fd24d",
"6f6d7e7ed97bb5f0054f2b1df789b39ca89b6ff9",
"d14d6c0abdd253381df51a723d58691b2ee1ab08",
"c1acaa58bbcbc3eafe538cb8274ba387047b69f8",
"ae73cb07c9eeaf35924a10f713b364d32b2dd34f",
"874797c3a73b60d2187ed6e2fcabd289ff75171e",
"2f63565e7aac07bcdadb654e253078b727143ec4",
"33f3729a45c02fc67d00adb1b8bca394b0e761d9",
"913c66a37b4a45b9769037c55c2d238bd0942d2e",
"cfe32cf61b73a0d5e9f13e774abde7ff789b1660",
"6d394385cf567f80a8fd85055db1ab4c5295806f",
"1a0b36b3cdad1d2ee32457c102a8c0b7056fa863",
}
// We don't mind the order these are given in
sort.Strings(expectedCommits)
sort.Strings(commitHashes)
assert.Equal(t, expectedCommits, commitHashes)
// Now choose one commit and check it in detail
commit := commits[initialSHA]
date, err := time.Parse("Mon Jan 02 15:04:05 2006 -0700", "Thu Feb 27 00:03:18 2014 -0800")
assert.NoError(t, err)
dmitriy := git.Signature{
Name: "Dmitriy Zaporozhets",
Email: "dmitriy.zaporozhets@gmail.com",
When: date,
}
assert.Equal(t, initialSHA, commit.Hash)
assert.Equal(t, "Initial commit\n", commit.Message)
assert.Equal(t, dmitriy, commit.Author)
assert.Equal(t, dmitriy, commit.Committer)
}
func TestEachCommitGivenRangeOf3Commits(t *testing.T) {
checkDeps(t)
repo, err := git.NewGoGitRepository(*testRepo, "1b12f15a11fc6e62177bef08f47bc7b5ce50b141", headSHA)
assert.NoError(t, err)
_, commitHashes, err := runEachCommit(repo)
assert.NoError(t, err)
expected := []string{"498214de67004b1da3d820901307bed2a68a8ef6", headSHA}
sort.Strings(expected)
sort.Strings(commitHashes)
assert.Equal(t, expected, commitHashes)
}
func TestEachCommitGivenRangeOf2Commits(t *testing.T) {
checkDeps(t)
repo, err := git.NewGoGitRepository(*testRepo, "498214de67004b1da3d820901307bed2a68a8ef6", headSHA)
assert.NoError(t, err)
_, commitHashes, err := runEachCommit(repo)
assert.NoError(t, err)
assert.Equal(t, []string{headSHA}, commitHashes)
}
func TestEachCommitGivenRangeOf1Commit(t *testing.T) {
checkDeps(t)
repo, err := git.NewGoGitRepository(*testRepo, headSHA, headSHA)
assert.NoError(t, err)
_, commitHashes, err := runEachCommit(repo)
assert.NoError(t, err)
assert.Equal(t, []string{}, commitHashes)
}
func TestEmptyToSHADefaultsToHeadSHA(t *testing.T) {
checkDeps(t)
repo, err := git.NewGoGitRepository(*testRepo, "498214de67004b1da3d820901307bed2a68a8ef6", "")
assert.NoError(t, err)
_, commitHashes, err := runEachCommit(repo)
assert.NoError(t, err)
assert.Equal(t, []string{headSHA}, commitHashes)
}
// TODO: store the ins/mod/del status of each change
func runEachFileChange(repo git.Repository) (map[string]*git.File, []string, error) {
files := make(map[string]*git.File)
filePaths := []string{}
store := func(f *git.File, _, _ string) error {
files[f.Path] = f
filePaths = append(filePaths, f.Path)
return nil
}
err := repo.EachFileChange(store, store, store)
return files, filePaths, err
}
func TestEachFileChangeMod(t *testing.T) {
checkDeps(t)
repo, err := git.NewGoGitRepository(*testRepo, "", headSHA)
assert.NoError(t, err)
files, filePaths, err := runEachFileChange(repo)
assert.NoError(t, err)
expectedFiles := []string{
".gitattributes",
".gitignore",
".gitmodules",
"CHANGELOG",
"CONTRIBUTING.md",
"Gemfile.zip",
"LICENSE",
"MAINTENANCE.md",
"PROCESS.md",
"README",
"README.md",
"VERSION",
"bar/branch-test.txt",
"custom-highlighting/test.gitlab-custom",
"encoding/feature-1.txt",
"encoding/feature-2.txt",
"encoding/hotfix-1.txt",
"encoding/hotfix-2.txt",
"encoding/iso8859.txt",
"encoding/russian.rb",
"encoding/test.txt",
"encoding/テスト.txt",
"encoding/テスト.xls",
"files/html/500.html",
"files/images/6049019_460s.jpg",
"files/images/logo-black.png",
"files/images/logo-white.png",
"files/images/wm.svg",
"files/js/application.js",
"files/js/commit.coffee",
"files/lfs/lfs_object.iso",
"files/markdown/ruby-style-guide.md",
"files/ruby/popen.rb",
"files/ruby/regex.rb",
"files/ruby/version_info.rb",
"files/whitespace",
"foo/bar/.gitkeep",
"with space/README.md",
}
// We don't mind the order these are given in
sort.Strings(expectedFiles)
sort.Strings(filePaths)
assert.Equal(t, expectedFiles, filePaths)
// Now choose one file and check it in detail
file := files["VERSION"]
blob, err := file.Blob()
assert.NoError(t, err)
data, err := ioutil.ReadAll(blob)
assert.NoError(t, err)
assert.Equal(t, "VERSION", file.Path)
assert.Equal(t, "998707b421c89bd9a3063333f9f728ef3e43d101", file.Oid)
assert.Equal(t, int64(10), file.Size)
assert.Equal(t, "6.7.0.pre\n", string(data))
}
func TestEachFileChangeGivenRangeOfThreeCommits(t *testing.T) {
checkDeps(t)
repo, err := git.NewGoGitRepository(*testRepo, "1b12f15a11fc6e62177bef08f47bc7b5ce50b141", headSHA)
assert.NoError(t, err)
_, filePaths, err := runEachFileChange(repo)
assert.NoError(t, err)
assert.Equal(t, []string{"bar/branch-test.txt"}, filePaths)
}
func TestEachFileChangeGivenRangeOfTwoCommits(t *testing.T) {
checkDeps(t)
repo, err := git.NewGoGitRepository(*testRepo, "498214de67004b1da3d820901307bed2a68a8ef6", headSHA)
assert.NoError(t, err)
_, filePaths, err := runEachFileChange(repo)
assert.NoError(t, err)
assert.Equal(t, []string{}, filePaths)
}
package indexer
import (
"bytes"
"fmt"
"io/ioutil"
"gitlab.com/gitlab-org/es-git-go/git"
"gitlab.com/gitlab-org/es-git-go/linguist"
)
var (
SkipTooLargeBlob = fmt.Errorf("Blob should be skipped: Too large")
SkipBinaryBlob = fmt.Errorf("Blob should be skipped: binary")
)
const (
binarySearchLimit = 8 * 1024 // 8 KiB, Same as git
maxBlobSize = 1024 * 1024 // 1MiB, same as gitlab-elasticsearch-git
)
func isSkipBlobErr(err error) bool {
switch err {
case SkipTooLargeBlob:
return true
case SkipBinaryBlob:
return true
}
return false
}
type Blob struct {
Type string `json:"type"`
ID string `json:"-"`
OID string `json:"oid"`
RepoID string `json:"rid"`
CommitSHA string `json:"commit_sha"`
Content string `json:"content"`
Path string `json:"path"`
// Message copied from gitlab-elasticsearch-git:
//
// We're duplicating file_name parameter here because we need another
// analyzer for it.
//
// Ideally this should be done with copy_to: 'blob.file_name' option
// but it does not work in ES v2.3.*. We're doing it so to not make users
// install newest versions
//
// https://github.com/elastic/elasticsearch-mapper-attachments/issues/124
Filename string `json:"file_name"`
Language string `json:"language"`
}
func GenerateBlobID(parentID, filename string) string {
return fmt.Sprintf("%s_%s", parentID, filename)
}
func BuildBlob(file *git.File, parentID, commitSHA string) (*Blob, error) {
if file.Size > maxBlobSize {
return nil, SkipTooLargeBlob
}
reader, err := file.Blob()
if err != nil {
return nil, err
}
defer reader.Close()
// FIXME(nick): This doesn't look cheap. Check the RAM & CPU pressure, esp.
// for large blobs
b, err := ioutil.ReadAll(reader)
if err != nil {
return nil, err
}
if DetectBinary(b) {
return nil, SkipBinaryBlob
}
content := tryEncodeBytes(b)
filename := tryEncodeString(file.Path)
return &Blob{
Type: "blob",
ID: GenerateBlobID(parentID, filename),
OID: file.Oid,
RepoID: parentID,
CommitSHA: commitSHA,
Content: content,
Path: filename,
Filename: filename,
Language: DetectLanguage(filename, b),
}, nil
}
// DetectLanguage returns a string describing the language of the file. This is
// the programming language, rather than the natural language.
//
// If no language is detected, "Text" is returned.
func DetectLanguage(filename string, data []byte) string {
lang := linguist.DetectLanguage(filename, data)
if lang != nil {
return lang.Name
}
return "Text"
}
// DetectBinary checks whether the passed-in data contains a NUL byte. Only the
// start of large blobs is scanned, which is the same test git performs to
// distinguish text from binary.
func DetectBinary(data []byte) bool {
searchLimit := binarySearchLimit
if len(data) < searchLimit {
searchLimit = len(data)
}
return bytes.Contains(data[:searchLimit], []byte{0})
}
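
A quick illustration of the two detectors; the expected outputs follow from the definitions above and the unit tests below:

```
package main

import (
	"fmt"

	"gitlab.com/gitlab-org/es-git-go/indexer"
)

func main() {
	fmt.Println(indexer.DetectBinary([]byte("plain text"))) // false
	fmt.Println(indexer.DetectBinary([]byte("foo\x00bar"))) // true: contains a NUL byte

	fmt.Println(indexer.DetectLanguage("foo.rb", []byte("foo")))  // Ruby
	fmt.Println(indexer.DetectLanguage("foo/bar", []byte("foo"))) // Text
}
```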
package indexer_test
import (
"encoding/json"
"testing"
"github.com/stretchr/testify/assert"
"gitlab.com/gitlab-org/es-git-go/indexer"
)
func TestBuildBlob(t *testing.T) {
file := gitFile("foo/bar", "foo")
expected := validBlob(file, "foo", "Text")
actual, err := indexer.BuildBlob(file, expected.RepoID, expected.CommitSHA)
assert.NoError(t, err)
assert.Equal(t, expected, actual)
expectedJSON := `{
"commit_sha" : "` + expected.CommitSHA + `",
"content" : "` + expected.Content + `",
"file_name" : "` + expected.Filename + `",
"language" : "` + expected.Language + `",
"oid" : "` + expected.OID + `",
"path" : "` + expected.Path + `",
"rid" : "` + expected.RepoID + `",
"type" : "blob"
}`
actualJSON, err := json.Marshal(actual)
assert.NoError(t, err)
assert.JSONEq(t, expectedJSON, string(actualJSON))
}
func TestBuildBlobSkipsLargeBlobs(t *testing.T) {
file := gitFile("foo/bar", "foo")
file.Size = 1024*1024 + 1
blob, err := indexer.BuildBlob(file, parentID, sha)
assert.Error(t, err, indexer.SkipTooLargeBlob)
assert.Nil(t, blob)
}
func TestBuildBlobSkipsBinaryBlobs(t *testing.T) {
file := gitFile("foo/bar", "foo\x00")
blob, err := indexer.BuildBlob(file, parentID, sha)
assert.Equal(t, err, indexer.SkipBinaryBlob)
assert.Nil(t, blob)
}
func TestBuildBlobDetectsLanguageByFilename(t *testing.T) {
file := gitFile("Makefile.am", "foo")
blob, err := indexer.BuildBlob(file, parentID, sha)
assert.NoError(t, err)
assert.Equal(t, "Makefile", blob.Language)
}
func TestBuildBlobDetectsLanguageByExtension(t *testing.T) {
file := gitFile("foo.rb", "foo")
blob, err := indexer.BuildBlob(file, parentID, sha)
assert.NoError(t, err)
assert.Equal(t, "Ruby", blob.Language)
}
func TestGenerateBlobID(t *testing.T) {
assert.Equal(t, "projectID_path", indexer.GenerateBlobID("projectID", "path"))
}
package indexer
import (
"fmt"
"gitlab.com/gitlab-org/es-git-go/git"
)
type Commit struct {
Type string `json:"type"`
ID string `json:"-"`
Author *Person `json:"author"`
Committer *Person `json:"committer"`
RepoID string `json:"rid"`
Message string `json:"message"`
SHA string `json:"sha"`
}
func GenerateCommitID(parentID, commitSHA string) string {
return fmt.Sprintf("%s_%s", parentID, commitSHA)
}
func BuildCommit(c *git.Commit, parentID string) *Commit {
sha := c.Hash
return &Commit{
Type: "commit",
Author: BuildPerson(c.Author),
Committer: BuildPerson(c.Committer),
ID: GenerateCommitID(parentID, sha),
RepoID: parentID,
Message: tryEncodeString(c.Message),
SHA: sha,
}
}
package indexer_test
import (
"encoding/json"
"testing"
"github.com/stretchr/testify/assert"
"gitlab.com/gitlab-org/es-git-go/indexer"
)
func TestBuildCommit(t *testing.T) {
gitCommit := gitCommit("Initial commit")
expected := validCommit(gitCommit)
actual := indexer.BuildCommit(gitCommit, expected.RepoID)
assert.Equal(t, expected, actual)
expectedJSON := `{
"sha" : "` + expected.SHA + `",
"message" : "` + expected.Message + `",
"author" : {
"name": "` + expected.Author.Name + `",
"email": "` + expected.Author.Email + `",
"time": "` + indexer.GenerateDate(gitCommit.Author.When) + `"
},
"committer" : {
"name": "` + expected.Committer.Name + `",
"email": "` + expected.Committer.Email + `",
"time": "` + indexer.GenerateDate(gitCommit.Committer.When) + `"
},
"rid" : "` + expected.RepoID + `",
"type" : "commit"
}`
actualJSON, err := json.Marshal(actual)
assert.NoError(t, err)
assert.JSONEq(t, expectedJSON, string(actualJSON))
}
func TestGenerateCommitID(t *testing.T) {
assert.Equal(t, "projectID_sha", indexer.GenerateCommitID("projectID", "sha"))
}
package indexer
import (
"fmt"
"log"
"github.com/lupine/icu"
)
var (
detector *icu.CharsetDetector
converter = icu.NewCharsetConverter(maxBlobSize)
)
func init() {
var err error
detector, err = icu.NewCharsetDetector()
if err != nil {
panic(err)
}
}
func tryEncodeString(s string) string {
encoded, err := encodeString(s)
if err != nil {
log.Println(err)
return s // TODO: Run it through the UTF-8 replacement encoder
}
return encoded
}
func tryEncodeBytes(b []byte) string {
encoded, err := encodeBytes(b)
if err != nil {
log.Println(err)
s := string(b)
return s // TODO: Run it through the UTF-8 replacement encoder
}
return encoded
}
func encodeString(s string) (string, error) {
return encodeBytes([]byte(s))
}
// encodeBytes converts a byte slice from an arbitrary encoding to UTF-8
func encodeBytes(b []byte) (string, error) {
if len(b) == 0 {
return "", nil
}
matches, err := detector.GuessCharset(b)
if err != nil {
return "", fmt.Errorf("Couldn't guess charset: %s", err)
}
// Try encoding for each match, returning the first that succeeds
for _, match := range matches {
utf8, err := converter.ConvertToUtf8(b, match.Charset)
if err == nil {
return string(utf8), nil
}
}
return "", fmt.Errorf("Failed to convert from %s to UTF-8", matches[0].Charset)
}
package indexer
import (
"fmt"
"log"
"gitlab.com/gitlab-org/es-git-go/git"
)
type Submitter interface {
ParentID() string
Index(id string, thing interface{})
Remove(id string)
Flush() error
}
type Indexer struct {
git.Repository
Submitter
}
func (i *Indexer) SubmitCommit(c *git.Commit) error {
commit := BuildCommit(c, i.Submitter.ParentID())
i.Submitter.Index(commit.ID, map[string]interface{}{"commit": commit})
return nil
}
func (i *Indexer) SubmitBlob(f *git.File, _, toCommit string) error {
blob, err := BuildBlob(f, i.Submitter.ParentID(), toCommit)
if err != nil {
if isSkipBlobErr(err) {
return nil
}
return fmt.Errorf("Blob %s: %s", f.Path, err)
}
i.Submitter.Index(blob.ID, map[string]interface{}{"blob": blob})
return nil
}
func (i *Indexer) RemoveBlob(file *git.File, _, _ string) error {
blobID := GenerateBlobID(i.Submitter.ParentID(), file.Path)
i.Submitter.Remove(blobID)
return nil
}
func (i *Indexer) IndexCommits() error {
return i.Repository.EachCommit(i.SubmitCommit)
}
func (i *Indexer) IndexBlobs() error {
return i.Repository.EachFileChange(i.SubmitBlob, i.SubmitBlob, i.RemoveBlob)
}
func (i *Indexer) Index() error {
if err := i.IndexBlobs(); err != nil {
log.Print("Error while indexing blobs: ", err)
return err
}
if err := i.IndexCommits(); err != nil {
log.Print("Error while indexing commits: ", err)
return err
}
if err := i.Submitter.Flush(); err != nil {
log.Print("Error while flushing requests: ", err)
}
return nil
}
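
For context, a hedged sketch of how the pieces are wired together, roughly what the integration tests below exercise through the command-line binary (the argument order and the FROM_SHA/TO_SHA environment variables are taken from those tests):

```
package main

import (
	"log"
	"os"

	"gitlab.com/gitlab-org/es-git-go/elastic"
	"gitlab.com/gitlab-org/es-git-go/git"
	"gitlab.com/gitlab-org/es-git-go/indexer"
)

func main() {
	projectID, repoPath := os.Args[1], os.Args[2]

	repo, err := git.NewGoGitRepository(repoPath, os.Getenv("FROM_SHA"), os.Getenv("TO_SHA"))
	if err != nil {
		log.Fatal(err)
	}

	// elastic.Client satisfies the Submitter interface defined above.
	client, err := elastic.FromEnv(projectID)
	if err != nil {
		log.Fatal(err)
	}
	defer client.Close()

	idx := &indexer.Indexer{Repository: repo, Submitter: client}
	if err := idx.Index(); err != nil {
		log.Fatal(err)
	}
}
```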
package indexer_test
import (
"fmt"
"io"
"io/ioutil"
"strings"
"testing"
"time"
"github.com/stretchr/testify/assert"
"gitlab.com/gitlab-org/es-git-go/git"
"gitlab.com/gitlab-org/es-git-go/indexer"
)
const (
sha = "9876543210987654321098765432109876543210"
oid = "0123456789012345678901234567890123456789"
parentID = "667"
)
type fakeSubmitter struct {
flushed int
indexed int
indexedID []string
indexedThing []interface{}
removed int
removedID []string
}
type fakeRepository struct {
commits []*git.Commit
added []*git.File
modified []*git.File
removed []*git.File
}
func (f *fakeSubmitter) ParentID() string {
return parentID
}
func (f *fakeSubmitter) Index(id string, thing interface{}) {
f.indexed++
f.indexedID = append(f.indexedID, id)
f.indexedThing = append(f.indexedThing, thing)
}
func (f *fakeSubmitter) Remove(id string) {
f.removed++
f.removedID = append(f.removedID, id)
}
func (f *fakeSubmitter) Flush() error {
f.flushed++
return nil
}
func (r *fakeRepository) EachFileChange(ins, mod, del git.FileFunc) error {
for _, file := range r.added {
if err := ins(file, sha, sha); err != nil {
return err
}
}
for _, file := range r.modified {
if err := mod(file, sha, sha); err != nil {
return err
}
}
for _, file := range r.removed {
if err := del(file, sha, sha); err != nil {
return err
}
}
return nil
}
func (r *fakeRepository) EachCommit(f git.CommitFunc) error {
for _, commit := range r.commits {
if err := f(commit); err != nil {
return err
}
}
return nil
}
func setupIndexer() (*indexer.Indexer, *fakeRepository, *fakeSubmitter) {
repo := &fakeRepository{}
submitter := &fakeSubmitter{}
return &indexer.Indexer{
Repository: repo,
Submitter: submitter,
}, repo, submitter
}
func readerFunc(data string, err error) func() (io.ReadCloser, error) {
return func() (io.ReadCloser, error) {
return ioutil.NopCloser(strings.NewReader(data)), err
}
}
func gitFile(path, content string) *git.File {
return &git.File{
Path: path,
Blob: readerFunc(content, nil),
Size: int64(len(content)),
Oid: oid,
}
}
func gitCommit(message string) *git.Commit {
return &git.Commit{
Author: git.Signature{
Email: "job@gitlab.com",
Name: "Job van der Voort",
When: time.Date(2016, time.September, 27, 14, 37, 46, 0, time.UTC),
},
Committer: git.Signature{
Email: "nick@gitlab.com",
Name: "Nick Thomas",
When: time.Date(2017, time.October, 28, 15, 38, 47, 1, time.UTC),
},
Message: message,
Hash: sha,
}
}
func validBlob(file *git.File, content, language string) *indexer.Blob {
return &indexer.Blob{
Type: "blob",
ID: indexer.GenerateBlobID(parentID, file.Path),
OID: oid,
RepoID: parentID,
CommitSHA: sha,
Content: content,
Path: file.Path,
Filename: file.Path,
Language: language,
}
}
func validCommit(gitCommit *git.Commit) *indexer.Commit {
return &indexer.Commit{
Type: "commit",
ID: indexer.GenerateCommitID(parentID, gitCommit.Hash),
Author: indexer.BuildPerson(gitCommit.Author),
Committer: indexer.BuildPerson(gitCommit.Committer),
RepoID: parentID,
Message: gitCommit.Message,
SHA: sha,
}
}
func TestIndex(t *testing.T) {
idx, repo, submit := setupIndexer()
gitCommit := gitCommit("Initial commit")
gitAdded := gitFile("foo/bar", "added file")
gitModified := gitFile("foo/baz", "modified file")
gitRemoved := gitFile("foo/qux", "removed file")
gitTooBig := gitFile("invalid/too-big", "")
gitTooBig.Size = int64(1024*1024 + 1)
gitBinary := gitFile("invalid/binary", "foo\x00")
commit := validCommit(gitCommit)
added := validBlob(gitAdded, "added file", "Text")
modified := validBlob(gitModified, "modified file", "Text")
removed := validBlob(gitRemoved, "removed file", "Text")
repo.commits = append(repo.commits, gitCommit)
repo.added = append(repo.added, gitAdded, gitTooBig, gitBinary)
repo.modified = append(repo.modified, gitModified)
repo.removed = append(repo.removed, gitRemoved)
idx.Index()
assert.Equal(t, submit.indexed, 3)
assert.Equal(t, submit.removed, 1)
assert.Equal(t, parentID+"_"+added.Path, submit.indexedID[0])
assert.Equal(t, map[string]interface{}{"blob": added}, submit.indexedThing[0])
assert.Equal(t, parentID+"_"+modified.Path, submit.indexedID[1])
assert.Equal(t, map[string]interface{}{"blob": modified}, submit.indexedThing[1])
assert.Equal(t, parentID+"_"+commit.SHA, submit.indexedID[2])
assert.Equal(t, map[string]interface{}{"commit": commit}, submit.indexedThing[2])
assert.Equal(t, parentID+"_"+removed.Path, submit.removedID[0])
assert.Equal(t, submit.flushed, 1)
}
func TestErrorIndexingSkipsRemainder(t *testing.T) {
idx, repo, submit := setupIndexer()
gitOKFile := gitFile("ok", "")
gitBreakingFile := gitFile("broken", "")
gitBreakingFile.Blob = readerFunc("", fmt.Errorf("Error"))
repo.added = append(repo.added, gitBreakingFile, gitOKFile)
err := idx.Index()
assert.Error(t, err)
assert.Equal(t, submit.indexed, 0)
assert.Equal(t, submit.removed, 0)
assert.Equal(t, submit.flushed, 0)
}
package indexer
import (
"time"
"gitlab.com/gitlab-org/es-git-go/git"
)
const (
elasticTimeFormat = "20060102T150405-0700"
)
type Person struct {
Name string `json:"name"`
Email string `json:"email"`
Time string `json:"time"` // %Y%m%dT%H%M%S%z
}
func GenerateDate(t time.Time) string {
return t.Format(elasticTimeFormat)
}
func BuildPerson(p git.Signature) *Person {
return &Person{
Name: tryEncodeString(p.Name),
Email: tryEncodeString(p.Email),
Time: GenerateDate(p.When),
}
}
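
A worked example of the timestamp format; the output matches the value the integration tests below expect for this commit time:

```
package main

import (
	"fmt"
	"time"

	"gitlab.com/gitlab-org/es-git-go/indexer"
)

func main() {
	t := time.Date(2016, time.September, 27, 14, 37, 46, 0, time.UTC)
	fmt.Println(indexer.GenerateDate(t)) // 20160927T143746+0000
}
```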
package main_test
import (
"encoding/json"
"flag"
"fmt"
"os"
"os/exec"
"testing"
"time"
"github.com/stretchr/testify/assert"
"gitlab.com/gitlab-org/es-git-go/elastic"
"gitlab.com/gitlab-org/es-git-go/indexer"
)
var (
binary = flag.String("binary", "./bin/es-git-go", "Path to `es-git-go` binary for integration tests")
testRepo = flag.String("test-repo", "./tmp/gitlab-test.git", "Path to `gitlab-test` repository for integration tests")
)
const (
projectID = "667"
headSHA = "b83d6e391c22777fca1ed3012fce84f633d7fed0"
)
func checkDeps(t *testing.T) {
if os.Getenv("ELASTIC_CONNECTION_INFO") == "" {
t.Log("ELASTIC_CONNECTION_INFO not set")
t.Skip()
}
if testing.Short() {
t.Log("Test run with -short, skipping integration test")
t.Skip()
}
if _, err := os.Stat(*binary); err != nil {
t.Log("No binary found at ", *binary)
t.Skip()
}
if _, err := os.Stat(*testRepo); err != nil {
t.Log("No test repo found at ", *testRepo)
t.Skip()
}
}
func buildIndex(t *testing.T) (*elastic.Client, func()) {
railsEnv := fmt.Sprintf("test-integration-%d", time.Now().Unix())
os.Setenv("RAILS_ENV", railsEnv)
client, err := elastic.FromEnv(projectID)
assert.NoError(t, err)
assert.NoError(t, client.CreateIndex())
return client, func() {
client.DeleteIndex()
}
}
func run(from, to string) error {
cmd := exec.Command(*binary, projectID, *testRepo)
cmd.Env = os.Environ()
cmd.Stdout = os.Stdout
cmd.Stderr = os.Stderr
if from != "" {
cmd.Env = append(cmd.Env, "FROM_SHA="+from)
}
if to != "" {
cmd.Env = append(cmd.Env, "TO_SHA="+to)
}
return cmd.Run()
}
func TestIndexingRemovesFiles(t *testing.T) {
checkDeps(t)
c, td := buildIndex(t)
defer td()
// The commit before files/empty is removed - so it should be indexed
assert.NoError(t, run("", "19e2e9b4ef76b422ce1154af39a91323ccc57434"))
_, err := c.GetBlob("files/empty")
assert.NoError(t, err)
// Now we expect it to have been removed
assert.NoError(t, run("19e2e9b4ef76b422ce1154af39a91323ccc57434", "08f22f255f082689c0d7d39d19205085311542bc"))
_, err = c.GetBlob("files/empty")
assert.Error(t, err)
}
// Go source is defined to be UTF-8 encoded, so literals here are UTF-8
func TestIndexingTranscodesToUTF8(t *testing.T) {
checkDeps(t)
c, td := buildIndex(t)
defer td()
assert.NoError(t, run("", headSHA))
for _, tc := range []struct{
path string
expected string
} {
{"encoding/iso8859.txt", "狞\n"}, // GB18030
{"encoding/test.txt", "これはテストです。\nこれもマージして下さい。\n\nAdd excel file.\nDelete excel file."}, // SHIFT_JIS
} {
blob, err := c.GetBlob(tc.path)
assert.NoError(t, err)
blobDoc := make(map[string]*indexer.Blob)
assert.NoError(t, json.Unmarshal(*blob.Source, &blobDoc))
assert.Equal(t, tc.expected, blobDoc["blob"].Content)
}
}
func TestIndexingGitlabTest(t *testing.T) {
checkDeps(t)
c, td := buildIndex(t)
defer td()
assert.NoError(t, run("", headSHA))
// Check the indexing of a commit
commit, err := c.GetCommit(headSHA)
assert.NoError(t, err)
assert.True(t, commit.Found)
assert.Equal(t, "repository", commit.Type)
assert.Equal(t, projectID+"_"+headSHA, commit.Id)
assert.Equal(t, projectID, commit.Routing)
assert.Equal(t, projectID, commit.Parent)
doc := make(map[string]map[string]interface{})
assert.NoError(t, json.Unmarshal(*commit.Source, &doc))
commitDoc, ok := doc["commit"]
assert.True(t, ok)
assert.Equal(
t,
map[string]interface{}{
"type": "commit",
"sha": headSHA,
"author": map[string]interface{}{
"email": "job@gitlab.com",
"name": "Job van der Voort",
"time": "20160927T143746+0000",
},
"committer": map[string]interface{}{
"email": "job@gitlab.com",
"name": "Job van der Voort",
"time": "20160927T143746+0000",
},
"rid": projectID,
"message": "Merge branch 'branch-merged' into 'master'\r\n\r\nadds bar folder and branch-test text file to check Repository merged_to_root_ref method\r\n\r\n\r\n\r\nSee merge request !12",
},
commitDoc,
)
// Check the indexing of a text blob
blob, err := c.GetBlob("README.md")
assert.NoError(t, err)
assert.True(t, blob.Found)
assert.Equal(t, "repository", blob.Type)
assert.Equal(t, projectID+"_README.md", blob.Id)
assert.Equal(t, projectID, blob.Routing)
assert.Equal(t, projectID, blob.Parent)
doc = make(map[string]map[string]interface{})
assert.NoError(t, json.Unmarshal(*blob.Source, &doc))
blobDoc, ok := doc["blob"]
assert.True(t, ok)
assert.Equal(
t,
map[string]interface{}{
"type": "blob",
"language": "Markdown",
"path": "README.md",
"file_name": "README.md",
"oid": "faaf198af3a36dbf41961466703cc1d47c61d051",
"rid": projectID,
"commit_sha": headSHA,
"content": "testme\n======\n\nSample repo for testing gitlab features\n",
},
blobDoc,
)
// Check that a binary blob isn't indexed
_, err = c.GetBlob("Gemfile.zip")
assert.Error(t, err)
// Test that timezones are preserved
commit, err = c.GetCommit("498214de67004b1da3d820901307bed2a68a8ef6")
assert.NoError(t, err)
cDoc := make(map[string]*indexer.Commit)
assert.NoError(t, json.Unmarshal(*commit.Source, &cDoc))
assert.Equal(t, "20160921T161326+0100", cDoc["commit"].Author.Time)
assert.Equal(t, "20160921T161326+0100", cDoc["commit"].Committer.Time)
}