Commit e9fd598f authored by Nick Thomas

Initial implementation of an elasticsearch indexer in Go

parent be84ab1b
/es-git-go
/.GOPATH
/bin
/tmp
/vendor/github.com/stretchr/testify
/vendor/github.com/wadey/gocovmerge
/vendor/golang.org/x/tools
/vendor/manifest
.test: &test
services:
- elasticsearch:5.1
variables:
V: "1"
ELASTIC_CONNECTION_INFO: '{"url":["http://elasticsearch:9200"]}'
stage: test
script:
- apt-get update && apt-get -yy install libicu-dev
- make setup
- make format
- make cover
- make
- make test
test 1.7:
<<: *test
image: golang:1.7
test 1.8:
<<: *test
image: golang:1.8
Makefile 0 → 100644
# The import path is where your repository can be found.
# To import subpackages, always prepend the full import path.
# If you change this, run `make clean`. Read more: https://git.io/vM7zV
IMPORT_PATH := gitlab.com/gitlab-org/es-git-go
GO15VENDOREXPERIMENT := 1
# V := 1 # When V is set, print commands and build progress.
# Space separated patterns of packages to skip in list, test, format.
IGNORED_PACKAGES := /vendor/
.PHONY: all
all: build
.PHONY: build
build: .GOPATH/.ok
$Q go install $(if $V,-v) $(VERSION_FLAGS) $(IMPORT_PATH)
### Code not in the repository root? Another binary? Add to the path like this.
# .PHONY: otherbin
# otherbin: .GOPATH/.ok
# $Q go install $(if $V,-v) $(VERSION_FLAGS) $(IMPORT_PATH)/cmd/otherbin
##### ^^^^^^ EDIT ABOVE ^^^^^^ #####
##### =====> Utility targets <===== #####
.PHONY: clean test list cover format
clean:
$Q rm -rf bin .GOPATH tmp
test: .GOPATH/.ok
$Q go test $(if $V,-v) -i -race $(allpackages) # install -race libs to speed up next run
ifndef CI
$Q go vet $(allpackages)
$Q GODEBUG=cgocheck=2 go test $(if $V,-v) -race $(allpackages)
else
$Q ( go vet $(allpackages); echo $$? ) | \
tee .GOPATH/test/vet.txt | sed '$$ d'; exit $$(tail -1 .GOPATH/test/vet.txt)
$Q ( GODEBUG=cgocheck=2 go test -v -race $(allpackages); echo $$? ) | \
tee .GOPATH/test/output.txt | sed '$$ d'; exit $$(tail -1 .GOPATH/test/output.txt)
endif
list: .GOPATH/.ok
@echo $(allpackages)
cover: bin/gocovmerge .GOPATH/.ok
@echo "NOTE: make cover does not exit 1 on failure, don't use it to check for tests success!"
$Q rm -f .GOPATH/cover/*.out .GOPATH/cover/all.merged
$(if $V,@echo "-- go test -coverpkg=./... -coverprofile=.GOPATH/cover/... ./...")
@for MOD in $(allpackages); do \
go test -coverpkg=`echo $(allpackages)|tr " " ","` \
-coverprofile=.GOPATH/cover/unit-`echo $$MOD|tr "/" "_"`.out \
$$MOD 2>&1 | grep -v "no packages being tested depend on"; \
done
$Q ./bin/gocovmerge .GOPATH/cover/*.out > .GOPATH/cover/all.merged
ifndef CI
$Q go tool cover -html .GOPATH/cover/all.merged
else
$Q go tool cover -html .GOPATH/cover/all.merged -o .GOPATH/cover/all.html
endif
@echo ""
@echo "=====> Total test coverage: <====="
@echo ""
$Q go tool cover -func .GOPATH/cover/all.merged
format: bin/goimports .GOPATH/.ok
$Q find .GOPATH/src/$(IMPORT_PATH)/ -iname \*.go | grep -v \
-e "^$$" $(addprefix -e ,$(IGNORED_PACKAGES)) | xargs ./bin/goimports -w
##### =====> Internals <===== #####
.PHONY: setup
setup: clean .GOPATH/.ok
@if ! grep "/.GOPATH" .gitignore > /dev/null 2>&1; then \
echo "/.GOPATH" >> .gitignore; \
echo "/bin" >> .gitignore; \
fi
go get -u github.com/FiloSottile/gvt
- ./bin/gvt fetch golang.org/x/tools/cmd/goimports
- ./bin/gvt fetch github.com/wadey/gocovmerge
- ./bin/gvt fetch github.com/stretchr/testify/assert
- mkdir tmp
- git clone --bare https://gitlab.com/gitlab-org/gitlab-test.git tmp/gitlab-test.git
VERSION := $(shell git describe --tags --always --dirty="-dev")
DATE := $(shell date -u '+%Y-%m-%d-%H%M UTC')
VERSION_FLAGS := -ldflags='-X "main.Version=$(VERSION)" -X "main.BuildTime=$(DATE)"'
# cd into the GOPATH to workaround ./... not following symlinks
_allpackages = $(shell ( cd $(CURDIR)/.GOPATH/src/$(IMPORT_PATH) && \
GOPATH=$(CURDIR)/.GOPATH go list ./... 2>&1 1>&3 | \
grep -v -e "^$$" $(addprefix -e ,$(IGNORED_PACKAGES)) 1>&2 ) 3>&1 | \
grep -v -e "^$$" $(addprefix -e ,$(IGNORED_PACKAGES)))
# memoize allpackages, so that it's executed only once and only if used
allpackages = $(if $(__allpackages),,$(eval __allpackages := $$(_allpackages)))$(__allpackages)
export GOPATH := $(CURDIR)/.GOPATH
unexport GOBIN
Q := $(if $V,,@)
.GOPATH/.ok:
$Q mkdir -p "$(dir .GOPATH/src/$(IMPORT_PATH))"
$Q ln -s ../../../.. ".GOPATH/src/$(IMPORT_PATH)"
$Q mkdir -p .GOPATH/test .GOPATH/cover
$Q mkdir -p bin
$Q ln -s ../bin .GOPATH/bin
$Q touch $@
.PHONY: bin/gocovmerge bin/goimports
bin/gocovmerge: .GOPATH/.ok
@test -d ./vendor/github.com/wadey/gocovmerge || \
{ echo "Vendored gocovmerge not found, try running 'make setup'..."; exit 1; }
$Q go install $(IMPORT_PATH)/vendor/github.com/wadey/gocovmerge
bin/goimports: .GOPATH/.ok
@test -d ./vendor/golang.org/x/tools/cmd/goimports || \
{ echo "Vendored goimports not found, try running 'make setup'..."; exit 1; }
$Q go install $(IMPORT_PATH)/vendor/golang.org/x/tools/cmd/goimports
# Based on https://github.com/cloudflare/hellogopher - v1.1 - MIT License
#
# Copyright (c) 2017 Cloudflare
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
# GitLab Elasticsearch Indexer
This project indexes Git repositories into Elasticsearch for GitLab. See the
[homepage](https://gitlab.com/gitlab-org/es-git-go) for more information.
## Building
This project relies on [ICU](http://site.icu-project.org/) for text encoding;
ensure the development packages for your platform are installed before running
`make`:
### Debian / Ubuntu
```
# apt install libicu-dev
```
### macOS
```
$ brew install icu4c
$ export PKG_CONFIG_PATH="/usr/local/opt/icu4c/lib/pkgconfig:$PKG_CONFIG_PATH"
```
package elastic
import (
"context"
"fmt"
"net/http"
"os"
"strings"
"github.com/aws/aws-sdk-go/aws/credentials"
"github.com/aws/aws-sdk-go/aws/signer/v4"
"github.com/deoxxa/aws_signing_client"
"gopkg.in/olivere/elastic.v5"
)
var (
timeoutError = fmt.Errorf("Timeout")
)
const (
// TODO: make this configurable / detectable.
// Limiting to 10MiB lets us work on small AWS clusters, but unnecessarily
// increases round trips in larger or non-AWS clusters
MaxBulkSize = 10 * 1024 * 1024
BulkWorkers = 10
)
type Client struct {
IndexName string
ProjectID string
Client *elastic.Client
bulk *elastic.BulkProcessor
}
// FromEnv creates an Elasticsearch client from the `ELASTIC_CONNECTION_INFO`
// environment variable
func FromEnv(projectID string) (*Client, error) {
data := strings.NewReader(os.Getenv("ELASTIC_CONNECTION_INFO"))
config, err := ReadConfig(data)
if err != nil {
return nil, fmt.Errorf("Couldn't parse ELASTIC_CONNECTION_INFO: %s", err)
}
railsEnv := os.Getenv("RAILS_ENV")
indexName := "gitlab"
if railsEnv != "" {
indexName = indexName + "-" + railsEnv
}
config.IndexName = indexName
config.ProjectID = projectID
return NewClient(config)
}
func NewClient(config *Config) (*Client, error) {
var opts []elastic.ClientOptionFunc
// AWS settings have to come first or they override custom URL, etc
if config.AWS {
credentials := credentials.NewStaticCredentials(config.AccessKey, config.SecretKey, "")
signer := v4.NewSigner(credentials)
awsClient, err := aws_signing_client.New(signer, &http.Client{}, "es", config.Region)
if err != nil {
return nil, err
}
opts = append(opts, elastic.SetHttpClient(awsClient))
}
// Sniffer should look for HTTPS URLs if at least one initial URL is HTTPS
for _, url := range config.URL {
if strings.HasPrefix(url, "https:") {
opts = append(opts, elastic.SetScheme("https"))
break
}
}
opts = append(opts, elastic.SetURL(config.URL...), elastic.SetSniff(false))
client, err := elastic.NewClient(opts...)
if err != nil {
return nil, err
}
bulk, err := client.BulkProcessor().
Workers(BulkWorkers).
BulkSize(MaxBulkSize).
Do(context.Background())
if err != nil {
return nil, err
}
return &Client{
IndexName: config.IndexName,
ProjectID: config.ProjectID,
Client: client,
bulk: bulk,
}, nil
}
func (c *Client) ParentID() string {
return c.ProjectID
}
func (c *Client) Flush() error {
return c.bulk.Flush()
}
func (c *Client) Close() {
c.Client.Stop()
}
func (c *Client) Index(id string, thing interface{}) {
req := elastic.NewBulkIndexRequest().
Index(c.IndexName).
Type("repository").
Parent(c.ProjectID).
Id(id).
Doc(thing)
c.bulk.Add(req)
}
// We only really use this for tests
func (c *Client) Get(id string) (*elastic.GetResult, error) {
return c.Client.Get().
Index(c.IndexName).
Type("repository").
Id(id).
Routing(c.ProjectID).
Do(context.TODO())
}
func (c *Client) GetCommit(id string) (*elastic.GetResult, error) {
return c.Get(c.ProjectID + "_" + id)
}
func (c *Client) GetBlob(path string) (*elastic.GetResult, error) {
return c.Get(c.ProjectID + "_" + path)
}
func (c *Client) Remove(id string) {
req := elastic.NewBulkDeleteRequest().
Index(c.IndexName).
Type("repository").
Parent(c.ProjectID).
Id(id)
c.bulk.Add(req)
}
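The client above is a thin bulk-indexing wrapper: FromEnv builds it from `ELASTIC_CONNECTION_INFO`, Index and Remove only queue bulk requests, and nothing is guaranteed to reach Elasticsearch until Flush returns. A minimal sketch of that flow, assuming the environment variable is set as in the CI configuration above; the project ID and document bodies are illustrative, and CreateIndex is defined alongside the index mapping further down:

```
package main

import (
	"log"

	"gitlab.com/gitlab-org/es-git-go/elastic"
)

func main() {
	// "667" is the same illustrative project ID used by the tests in this commit.
	client, err := elastic.FromEnv("667")
	if err != nil {
		log.Fatal(err)
	}
	defer client.Close()

	// CreateIndex is defined with the index mapping below.
	if err := client.CreateIndex(); err != nil {
		log.Fatal(err)
	}

	// Queue a document and a deletion; both go through the bulk processor.
	client.Index("667_README.md", map[string]interface{}{"blob": map[string]string{"content": "hello"}})
	client.Remove("667_stale-id")

	// Nothing is guaranteed to reach Elasticsearch until Flush returns.
	if err := client.Flush(); err != nil {
		log.Fatal(err)
	}
}
```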
package elastic_test
import (
"crypto/tls"
"fmt"
"net/http"
"net/http/httptest"
"os"
"regexp"
"strings"
"testing"
"time"
"github.com/stretchr/testify/assert"
"gitlab.com/gitlab-org/es-git-go/elastic"
)
const (
projectID = "667"
)
func TestAWSConfiguration(t *testing.T) {
var req *http.Request
// httptest certificate is unsigned
transport := http.DefaultTransport
defer func() { http.DefaultTransport = transport }()
http.DefaultTransport = &http.Transport{TLSClientConfig: &tls.Config{InsecureSkipVerify: true}}
f := func(w http.ResponseWriter, r *http.Request) {
req = r
w.Header().Set("Content-Type", "application/json")
w.Write([]byte(`{}`))
}
srv := httptest.NewTLSServer(http.HandlerFunc(f))
defer srv.Close()
config, err := elastic.ReadConfig(strings.NewReader(
`{
"url":["` + srv.URL + `"],
"aws":true,
"aws_region": "us-east-1",
"aws_access_key": "0",
"aws_secret_access_key": "0"
}`,
))
assert.NoError(t, err)
client, err := elastic.NewClient(config)
assert.NoError(t, err)
defer client.Close()
if assert.NotNil(t, req) {
authRE := regexp.MustCompile(`\AAWS4-HMAC-SHA256 Credential=0/\d{8}/us-east-1/es/aws4_request, SignedHeaders=accept;date;host;x-amz-date, Signature=[a-f0-9]{64}\z`)
assert.Regexp(t, authRE, req.Header.Get("Authorization"))
assert.NotEqual(t, "", req.Header.Get("X-Amz-Date"))
}
}
func TestElasticClientIndexAndRetrieval(t *testing.T) {
config := os.Getenv("ELASTIC_CONNECTION_INFO")
if config == "" {
t.Log("ELASTIC_CONNECTION_INFO not set")
t.SkipNow()
}
os.Setenv("RAILS_ENV", fmt.Sprintf("test-elastic-%d", time.Now().Unix()))
client, err := elastic.FromEnv(projectID)
assert.NoError(t, err)
assert.Equal(t, projectID, client.ParentID())
assert.NoError(t, client.CreateIndex())
blobDoc := map[string]interface{}{}
client.Index(projectID+"_foo", blobDoc)
commitDoc := map[string]interface{}{}
client.Index(projectID+"_0000", commitDoc)
assert.NoError(t, client.Flush())
blob, err := client.GetBlob("foo")
assert.NoError(t, err)
assert.Equal(t, true, blob.Found)
commit, err := client.GetCommit("0000")
assert.NoError(t, err)
assert.Equal(t, true, commit.Found)
client.Remove(projectID + "_foo")
assert.NoError(t, client.Flush())
_, err = client.GetBlob("foo")
assert.Error(t, err)
assert.NoError(t, client.DeleteIndex())
}
package elastic
import (
"encoding/json"
"io"
)
type Config struct {
IndexName string `json:"-"`
ProjectID string `json:"-"`
URL []string `json:"url"`
AWS bool `json:"aws"`
Region string `json:"aws_region"`
AccessKey string `json:"aws_access_key"`
SecretKey string `json:"aws_secret_access_key"`
}
func ReadConfig(r io.Reader) (*Config, error) {
var out Config
if err := json.NewDecoder(r).Decode(&out); err != nil {
return nil, err
}
return &out, nil
}
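ReadConfig accepts the same JSON document that the CI configuration above passes via `ELASTIC_CONNECTION_INFO`. A small sketch parsing a full configuration; the URL, region and credentials are illustrative values only:

```
package main

import (
	"fmt"
	"strings"

	"gitlab.com/gitlab-org/es-git-go/elastic"
)

func main() {
	// Field names follow the json tags on Config above; the credentials are fakes.
	config, err := elastic.ReadConfig(strings.NewReader(`{
		"url": ["http://localhost:9200"],
		"aws": true,
		"aws_region": "us-east-1",
		"aws_access_key": "fake-key",
		"aws_secret_access_key": "fake-secret"
	}`))
	if err != nil {
		panic(err)
	}
	fmt.Println(config.URL, config.Region)
}
```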
package elastic
import (
"context"
)
var indexMapping = `
{
"settings": {
"analysis": {
"filter": {
"my_stemmer": {
"name": "light_english",
"type": "stemmer"
},
"code": {
"type": "pattern_capture",
"preserve_original": "1",
"patterns": [
"(\\p{Ll}+|\\p{Lu}\\p{Ll}+|\\p{Lu}+)",
"(\\d+)"
]
}
},
"char_filter": {
"code_mapping": {
"type": "mapping",
"mappings": [
". => ' '"
]
}
},
"analyzer": {
"default": {
"filter": [
"standard",
"lowercase",
"my_stemmer"
],
"tokenizer": "standard"
},
"code_search_analyzer": {
"filter": [
"lowercase",
"asciifolding"
],
"char_filter": [
"code_mapping"
],
"type": "custom",
"tokenizer": "standard"
},
"path_analyzer": {
"filter": [
"lowercase",
"asciifolding"
],
"type": "custom",
"tokenizer": "path_tokenizer"
},
"sha_analyzer": {
"filter": [
"lowercase",
"asciifolding"
],
"type": "custom",
"tokenizer": "sha_tokenizer"
},
"code_analyzer": {
"filter": [
"code",
"lowercase",
"asciifolding"
],
"char_filter": [
"code_mapping"
],
"type": "custom",
"tokenizer": "standard"
},
"my_ngram_analyzer": {
"filter": [
"lowercase"
],
"tokenizer": "my_ngram_tokenizer"
}
},
"tokenizer": {
"my_ngram_tokenizer": {
"token_chars": [
"letter",
"digit"
],
"min_gram": "2",
"type": "nGram",
"max_gram": "3"
},
"sha_tokenizer": {
"token_chars": [
"letter",
"digit"
],
"min_gram": "5",
"type": "edgeNGram",
"max_gram": "40"
},
"path_tokenizer": {
"reverse": "true",
"type": "path_hierarchy"
}
}
}
},
"mappings": {
"milestone": {
"_parent": {
"type": "project"
},
"_routing": {
"required": true
},
"properties": {
"created_at": {
"type": "date"
},
"description": {
"type": "text",
"index_options": "offsets"
},
"id": {
"type": "integer"
},
"project_id": {
"type": "integer"
},
"title": {
"type": "text",
"index_options": "offsets"
},
"updated_at": {
"type": "date"
}
}
},
"note": {
"_parent": {
"type": "project"
},
"_routing": {
"required": true
},
"properties": {
"created_at": {
"type": "date"
},
"id": {
"type": "integer"
},
"issue": {
"properties": {
"assignee_id": {
"type": "integer"
},
"author_id": {
"type": "integer"
},
"confidential": {
"type": "boolean"
}
}
},
"note": {
"type": "text",
"index_options": "offsets"
},
"noteable_id": {
"type": "integer"
},
"noteable_type": {
"type": "keyword"
},
"project_id": {
"type": "integer"
},
"updated_at": {
"type": "date"
}
}
},
"project_wiki": {
"_parent": {
"type": "project"
},
"_routing": {
"required": true
},
"properties": {
"blob": {
"properties": {
"commit_sha": {
"type": "text",
"index_options": "offsets",
"analyzer": "sha_analyzer"
},
"content": {
"type": "text",
"index_options": "offsets",
"analyzer": "code_analyzer",
"search_analyzer": "code_search_analyzer"
},
"file_name": {
"type": "text",
"analyzer": "code_analyzer",
"search_analyzer": "code_search_analyzer"
},
"id": {
"type": "text",
"index_options": "offsets",
"analyzer": "sha_analyzer"
},
"language": {
"type": "keyword"
},
"oid": {
"type": "text",
"index_options": "offsets",
"analyzer": "sha_analyzer"
},
"path": {
"type": "text",
"analyzer": "path_analyzer"
},
"rid": {
"type": "keyword"
}
}
},
"commit": {
"properties": {
"author": {
"properties": {
"email": {
"type": "text",
"index_options": "offsets"
},
"name": {
"type": "text",
"index_options": "offsets"
},
"time": {
"type": "date",
"format": "basic_date_time_no_millis"
}
}
},
"commiter": {
"properties": {
"email": {
"type": "text",
"index_options": "offsets"
},
"name": {
"type": "text",
"index_options": "offsets"
},
"time": {
"type": "date",
"format": "basic_date_time_no_millis"
}
}
},
"id": {
"type": "text",
"index_options": "offsets",
"analyzer": "sha_analyzer"
},
"message": {
"type": "text",
"index_options": "offsets"
},
"rid": {
"type": "keyword"
},
"sha": {
"type": "text",
"index_options": "offsets",
"analyzer": "sha_analyzer"
}
}
}
}
},
"issue": {
"_parent": {
"type": "project"
},
"_routing": {
"required": true
},
"properties": {
"assignee_id": {
"type": "integer"
},
"author_id": {
"type": "integer"
},
"confidential": {
"type": "boolean"
},
"created_at": {
"type": "date"
},
"description": {
"type": "text",
"index_options": "offsets"
},
"id": {
"type": "integer"
},
"iid": {
"type": "integer"
},
"project_id": {
"type": "integer"
},
"state": {
"type": "text"
},
"title": {
"type": "text",
"index_options": "offsets"
},
"updated_at": {
"type": "date"
}
}
},
"merge_request": {
"_parent": {
"type": "project"
},
"_routing": {
"required": true
},
"properties": {
"author_id": {
"type": "integer"
},
"created_at": {
"type": "date"
},
"description": {
"type": "text",
"index_options": "offsets"
},
"id": {
"type": "integer"
},
"iid": {
"type": "integer"
},
"merge_status": {
"type": "text"
},
"source_branch": {
"type": "text",
"index_options": "offsets"
},
"source_project_id": {
"type": "integer"
},
"state": {
"type": "text"
},
"target_branch": {
"type": "text",
"index_options": "offsets"
},
"target_project_id": {
"type": "integer"
},
"title": {
"type": "text",
"index_options": "offsets"
},
"updated_at": {
"type": "date"
}
}
},
"repository": {
"_parent": {
"type": "project"
},
"_routing": {
"required": true
},
"properties": {
"blob": {
"properties": {
"commit_sha": {
"type": "text",
"index_options": "offsets",
"analyzer": "sha_analyzer"
},
"content": {
"type": "text",
"index_options": "offsets",
"analyzer": "code_analyzer",
"search_analyzer": "code_search_analyzer"
},
"file_name": {
"type": "text",
"analyzer": "code_analyzer",
"search_analyzer": "code_search_analyzer"
},
"id": {
"type": "text",
"index_options": "offsets",
"analyzer": "sha_analyzer"
},
"language": {
"type": "keyword"
},
"oid": {
"type": "text",
"index_options": "offsets",
"analyzer": "sha_analyzer"
},
"path": {
"type": "text",
"analyzer": "path_analyzer"
},
"rid": {
"type": "keyword"
},
"type": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
}
},
"commit": {
"properties": {
"author": {
"properties": {
"email": {
"type": "text",
"index_options": "offsets"
},
"name": {
"type": "text",
"index_options": "offsets"
},
"time": {
"type": "date",
"format": "basic_date_time_no_millis"
}
}
},
"commiter": {
"properties": {
"email": {
"type": "text",
"index_options": "offsets"
},
"name": {
"type": "text",
"index_options": "offsets"
},
"time": {
"type": "date",
"format": "basic_date_time_no_millis"
}
}
},
"committer": {
"properties": {
"email": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"name": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"time": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
}
},
"id": {
"type": "text",
"index_options": "offsets",
"analyzer": "sha_analyzer"
},
"message": {
"type": "text",
"index_options": "offsets"
},
"rid": {
"type": "keyword"
},
"sha": {
"type": "text",
"index_options": "offsets",
"analyzer": "sha_analyzer"
},
"type": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
}
}
}
},
"project": {
"properties": {
"archived": {
"type": "boolean"
},
"created_at": {
"type": "date"
},
"description": {
"type": "text",
"index_options": "offsets"
},
"id": {
"type": "integer"
},
"issues_access_level": {
"type": "integer"
},
"last_activity_at": {
"type": "date"
},
"last_pushed_at": {
"type": "date"
},
"merge_requests_access_level": {
"type": "integer"
},
"name": {
"type": "text",
"index_options": "offsets"
},
"name_with_namespace": {
"type": "text",
"index_options": "offsets",
"analyzer": "my_ngram_analyzer"
},
"namespace_id": {
"type": "integer"
},
"path": {
"type": "text",
"index_options": "offsets"
},
"path_with_namespace": {
"type": "text",
"index_options": "offsets"
},
"repository_access_level": {
"type": "integer"
},
"snippets_access_level": {
"type": "integer"
},
"updated_at": {
"type": "date"
},
"visibility_level": {
"type": "integer"
},
"wiki_access_level": {
"type": "integer"
}
}
},
"snippet": {
"properties": {
"author_id": {
"type": "integer"
},
"content": {
"type": "text",
"index_options": "offsets"
},
"created_at": {
"type": "date"
},
"file_name": {
"type": "text",
"index_options": "offsets"
},
"id": {
"type": "integer"
},
"project_id": {
"type": "integer"
},
"state": {
"type": "text"
},
"title": {
"type": "text",
"index_options": "offsets"
},
"updated_at": {
"type": "date"
},
"visibility_level": {
"type": "integer"
}
}
}
}
}
`
// CreateIndex creates an index matching that created by gitlab-elasticsearch-git v1.1.1
func (c *Client) CreateIndex() error {
createIndex, err := c.Client.CreateIndex(c.IndexName).BodyString(indexMapping).Do(context.Background())
if err != nil {
return err
}
if !createIndex.Acknowledged {
return timeoutError
}
return nil
}
func (c *Client) DeleteIndex() error {
deleteIndex, err := c.Client.DeleteIndex(c.IndexName).Do(context.Background())
if err != nil {
return err
}
if !deleteIndex.Acknowledged {
return timeoutError
}
return nil
}
package git
import (
"fmt"
"gopkg.in/src-d/go-git.v4"
"gopkg.in/src-d/go-git.v4/plumbing"
"gopkg.in/src-d/go-git.v4/plumbing/filemode"
"gopkg.in/src-d/go-git.v4/plumbing/object"
"gopkg.in/src-d/go-git.v4/utils/merkletrie"
)
var (
endError = fmt.Errorf("Finished") // not really an error
)
type goGitRepository struct {
*git.Repository
FromHash plumbing.Hash
ToHash plumbing.Hash
FromCommit *object.Commit
ToCommit *object.Commit
}
func NewGoGitRepository(projectPath string, fromSHA string, toSHA string) (*goGitRepository, error) {
out := &goGitRepository{}
repo, err := git.PlainOpen(projectPath)
if err != nil {
return nil, err
}
out.Repository = repo
if fromSHA == "" {
out.FromHash = plumbing.ZeroHash
} else {
out.FromHash = plumbing.NewHash(fromSHA)
commit, err := repo.CommitObject(out.FromHash)
if err != nil {
return nil, fmt.Errorf("Bad from SHA (%s): %s", out.FromHash, err)
}
out.FromCommit = commit
}
if toSHA == "" {
ref, err := out.Repository.Head()
if err != nil {
return nil, err
}
out.ToHash = ref.Hash()
} else {
out.ToHash = plumbing.NewHash(toSHA)
}
commit, err := out.Repository.CommitObject(out.ToHash)
if err != nil {
return nil, fmt.Errorf("Bad to SHA (%s): %s", out.ToHash, err)
}
out.ToCommit = commit
return out, nil
}
func (r *goGitRepository) diff() (object.Changes, error) {
var fromTree, toTree *object.Tree
if r.FromCommit != nil {
tree, err := r.FromCommit.Tree()
if err != nil {
return nil, err
}
fromTree = tree
}
toTree, err := r.ToCommit.Tree()
if err != nil {
return nil, err
}
return object.DiffTree(fromTree, toTree)
}
func goGitBuildSignature(sig object.Signature) Signature {
return Signature{
Name: sig.Name,
Email: sig.Email,
When: sig.When,
}
}
func goGitBuildFile(change object.ChangeEntry, file *object.File) *File {
return &File{
Path: change.Name,
Oid: file.ID().String(),
Blob: file.Blob.Reader,
Size: file.Size,
}
}
func (r *goGitRepository) EachFileChange(ins, mod, del FileFunc) error {
changes, err := r.diff()
if err != nil {
return err
}
fromCommitStr := r.FromHash.String()
toCommitStr := r.ToHash.String()
for _, change := range changes {
// FIXME(nick): submodules may need better support
// https://github.com/src-d/go-git/issues/317
if change.From.TreeEntry.Mode == filemode.Submodule || change.To.TreeEntry.Mode == filemode.Submodule {
continue
}
fromF, toF, err := change.Files()
if err != nil {
return err
}
action, err := change.Action()
if err != nil {
return err
}
switch action {
case merkletrie.Insert:
err = ins(goGitBuildFile(change.To, toF), fromCommitStr, toCommitStr)
case merkletrie.Modify:
err = mod(goGitBuildFile(change.To, toF), fromCommitStr, toCommitStr)
case merkletrie.Delete:
err = del(goGitBuildFile(change.From, fromF), fromCommitStr, toCommitStr)
default:
err = fmt.Errorf("Unrecognised change calculating diff: %+v", change)
}
if err != nil {
return err
}
}
return nil
}
// EachCommit runs `f` for each commit in the range `fromSHA`..`toSHA`.
// go-git doesn't directly support revision ranges, so we emulate one by
// walking the commit history from `toSHA` back towards `fromSHA`
func (r *goGitRepository) EachCommit(f CommitFunc) error {
err := object.WalkCommitHistoryPost(r.ToCommit, func(c *object.Commit) error {
if r.FromCommit != nil && c.ID() == r.FromCommit.ID() {
return endError
}
commit := &Commit{
Message: c.Message,
Hash: c.Hash.String(),
Author: goGitBuildSignature(c.Author),
Committer: goGitBuildSignature(c.Committer),
}
if err := f(commit); err != nil {
return err
}
return nil
})
if err != nil && err != endError {
return fmt.Errorf("WalkCommitHistory: %s", err)
}
return nil
}
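As the EachCommit comment notes, a revision range is emulated by walking history from `toSHA` back towards `fromSHA`. A minimal sketch of driving that walk directly; the repository path is a placeholder, and empty SHAs mean "from the beginning" and "up to HEAD" respectively, as in NewGoGitRepository above:

```
package main

import (
	"fmt"
	"log"

	"gitlab.com/gitlab-org/es-git-go/git"
)

func main() {
	// Placeholder path; empty from/to SHAs walk the full history up to HEAD.
	repo, err := git.NewGoGitRepository("/path/to/repo.git", "", "")
	if err != nil {
		log.Fatal(err)
	}

	err = repo.EachCommit(func(c *git.Commit) error {
		fmt.Println(c.Hash, c.Author.Name)
		return nil
	})
	if err != nil {
		log.Fatal(err)
	}
}
```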
package git
import (
"io"
"time"
)
type File struct {
Path string
Blob func() (io.ReadCloser, error)
Oid string
Size int64
}
type Signature struct {
Name string
Email string
When time.Time
}
type Commit struct {
Author Signature
Committer Signature
Message string
Hash string
}
type Repository interface {
EachFileChange(ins, mod, del FileFunc) error
EachCommit(f CommitFunc) error
}
type FileFunc func(file *File, fromCommit, toCommit string) error
type CommitFunc func(commit *Commit) error
package git_test
import (
"flag"
"io/ioutil"
"os"
"sort"
"testing"
"time"
"github.com/stretchr/testify/assert"
"gitlab.com/gitlab-org/es-git-go/git"
)
var (
testRepo = flag.String("test-repo", "../tmp/gitlab-test.git", "Path to `gitlab-test` repository for integration tests")
)
const (
initialSHA = "1a0b36b3cdad1d2ee32457c102a8c0b7056fa863"
headSHA = "b83d6e391c22777fca1ed3012fce84f633d7fed0"
)
func checkDeps(t *testing.T) {
if _, err := os.Stat(*testRepo); err != nil {
t.Log("No test repo found at ", *testRepo)
t.SkipNow()
}
}
func runEachCommit(repo git.Repository) (map[string]*git.Commit, []string, error) {
commits := make(map[string]*git.Commit)
commitHashes := []string{}
err := repo.EachCommit(func(commit *git.Commit) error {
commits[commit.Hash] = commit
commitHashes = append(commitHashes, commit.Hash)
return nil
})
return commits, commitHashes, err
}
func TestEachCommit(t *testing.T) {
checkDeps(t)
repo, err := git.NewGoGitRepository(*testRepo, "", headSHA)
assert.NoError(t, err)
commits, commitHashes, err := runEachCommit(repo)
assert.NoError(t, err)
expectedCommits := []string{
"b83d6e391c22777fca1ed3012fce84f633d7fed0",
"498214de67004b1da3d820901307bed2a68a8ef6",
"1b12f15a11fc6e62177bef08f47bc7b5ce50b141",
"38008cb17ce1466d8fec2dfa6f6ab8dcfe5cf49e",
"6907208d755b60ebeacb2e9dfea74c92c3449a1f",
"c347ca2e140aa667b968e51ed0ffe055501fe4f4",
"d59c60028b053793cecfb4022de34602e1a9218e",
"281d3a76f31c812dbf48abce82ccf6860adedd81",
"a5391128b0ef5d21df5dd23d98557f4ef12fae20",
"54fcc214b94e78d7a41a9a8fe6d87a5e59500e51",
"be93687618e4b132087f430a4d8fc3a609c9b77c",
"048721d90c449b244b7b4c53a9186b04330174ec",
"5f923865dde3436854e9ceb9cdb7815618d4e849",
"d2d430676773caa88cdaf7c55944073b2fd5561a",
"2ea1f3dec713d940208fb5ce4a38765ecb5d3f73",
"59e29889be61e6e0e5e223bfa9ac2721d31605b8",
"66eceea0db202bb39c4e445e8ca28689645366c5",
"08f22f255f082689c0d7d39d19205085311542bc",
"19e2e9b4ef76b422ce1154af39a91323ccc57434",
"c642fe9b8b9f28f9225d7ea953fe14e74748d53b",
"9a944d90955aaf45f6d0c88f30e27f8d2c41cec0",
"c7fbe50c7c7419d9701eebe64b1fdacc3df5b9dd",
"e56497bb5f03a90a51293fc6d516788730953899",
"4cd80ccab63c82b4bad16faa5193fbd2aa06df40",
"5937ac0a7beb003549fc5fd26fc247adbce4a52e",
"570e7b2abdd848b95f2f578043fc23bd6f6fd24d",
"6f6d7e7ed97bb5f0054f2b1df789b39ca89b6ff9",
"d14d6c0abdd253381df51a723d58691b2ee1ab08",
"c1acaa58bbcbc3eafe538cb8274ba387047b69f8",
"ae73cb07c9eeaf35924a10f713b364d32b2dd34f",
"874797c3a73b60d2187ed6e2fcabd289ff75171e",
"2f63565e7aac07bcdadb654e253078b727143ec4",
"33f3729a45c02fc67d00adb1b8bca394b0e761d9",
"913c66a37b4a45b9769037c55c2d238bd0942d2e",
"cfe32cf61b73a0d5e9f13e774abde7ff789b1660",
"6d394385cf567f80a8fd85055db1ab4c5295806f",
"1a0b36b3cdad1d2ee32457c102a8c0b7056fa863",
}
// We don't mind the order these are given in
sort.Strings(expectedCommits)
sort.Strings(commitHashes)
assert.Equal(t, expectedCommits, commitHashes)
// Now choose one commit and check it in detail
commit := commits[initialSHA]
date, err := time.Parse("Mon Jan 02 15:04:05 2006 -0700", "Thu Feb 27 00:03:18 2014 -0800")
assert.NoError(t, err)
dmitriy := git.Signature{
Name: "Dmitriy Zaporozhets",
Email: "dmitriy.zaporozhets@gmail.com",
When: date,
}
assert.Equal(t, initialSHA, commit.Hash)
assert.Equal(t, "Initial commit\n", commit.Message)
assert.Equal(t, dmitriy, commit.Author)
assert.Equal(t, dmitriy, commit.Committer)
}
func TestEachCommitGivenRangeOf3Commits(t *testing.T) {
checkDeps(t)
repo, err := git.NewGoGitRepository(*testRepo, "1b12f15a11fc6e62177bef08f47bc7b5ce50b141", headSHA)
assert.NoError(t, err)
_, commitHashes, err := runEachCommit(repo)
assert.NoError(t, err)
expected := []string{"498214de67004b1da3d820901307bed2a68a8ef6", headSHA}
sort.Strings(expected)
sort.Strings(commitHashes)
assert.Equal(t, expected, commitHashes)
}
func TestEachCommitGivenRangeOf2Commits(t *testing.T) {
checkDeps(t)
repo, err := git.NewGoGitRepository(*testRepo, "498214de67004b1da3d820901307bed2a68a8ef6", headSHA)
assert.NoError(t, err)
_, commitHashes, err := runEachCommit(repo)
assert.NoError(t, err)
assert.Equal(t, []string{headSHA}, commitHashes)
}
func TestEachCommitGivenRangeOf1Commit(t *testing.T) {
checkDeps(t)
repo, err := git.NewGoGitRepository(*testRepo, headSHA, headSHA)
assert.NoError(t, err)
_, commitHashes, err := runEachCommit(repo)
assert.NoError(t, err)
assert.Equal(t, []string{}, commitHashes)
}
func TestEmptyToSHADefaultsToHeadSHA(t *testing.T) {
checkDeps(t)
repo, err := git.NewGoGitRepository(*testRepo, "498214de67004b1da3d820901307bed2a68a8ef6", "")
assert.NoError(t, err)
_, commitHashes, err := runEachCommit(repo)
assert.NoError(t, err)
assert.Equal(t, []string{headSHA}, commitHashes)
}
// TODO: store the ins/mod/del status of each change
func runEachFileChange(repo git.Repository) (map[string]*git.File, []string, error) {
files := make(map[string]*git.File)
filePaths := []string{}
store := func(f *git.File, _, _ string) error {
files[f.Path] = f
filePaths = append(filePaths, f.Path)
return nil
}
err := repo.EachFileChange(store, store, store)
return files, filePaths, err
}
func TestEachFileChangeMod(t *testing.T) {
checkDeps(t)
repo, err := git.NewGoGitRepository(*testRepo, "", headSHA)
assert.NoError(t, err)
files, filePaths, err := runEachFileChange(repo)
assert.NoError(t, err)
expectedFiles := []string{
".gitattributes",
".gitignore",
".gitmodules",
"CHANGELOG",
"CONTRIBUTING.md",
"Gemfile.zip",
"LICENSE",
"MAINTENANCE.md",
"PROCESS.md",
"README",
"README.md",
"VERSION",
"bar/branch-test.txt",
"custom-highlighting/test.gitlab-custom",
"encoding/feature-1.txt",
"encoding/feature-2.txt",
"encoding/hotfix-1.txt",
"encoding/hotfix-2.txt",
"encoding/iso8859.txt",
"encoding/russian.rb",
"encoding/test.txt",
"encoding/テスト.txt",
"encoding/テスト.xls",
"files/html/500.html",
"files/images/6049019_460s.jpg",
"files/images/logo-black.png",
"files/images/logo-white.png",
"files/images/wm.svg",
"files/js/application.js",
"files/js/commit.coffee",
"files/lfs/lfs_object.iso",
"files/markdown/ruby-style-guide.md",
"files/ruby/popen.rb",
"files/ruby/regex.rb",
"files/ruby/version_info.rb",
"files/whitespace",
"foo/bar/.gitkeep",
"with space/README.md",
}
// We don't mind the order these are given in
sort.Strings(expectedFiles)
sort.Strings(filePaths)
assert.Equal(t, expectedFiles, filePaths)
// Now choose one file and check it in detail
file := files["VERSION"]
blob, err := file.Blob()
assert.NoError(t, err)
data, err := ioutil.ReadAll(blob)
assert.NoError(t, err)
assert.Equal(t, "VERSION", file.Path)
assert.Equal(t, "998707b421c89bd9a3063333f9f728ef3e43d101", file.Oid)
assert.Equal(t, int64(10), file.Size)
assert.Equal(t, "6.7.0.pre\n", string(data))
}
func TestEachFileChangeGivenRangeOfThreeCommits(t *testing.T) {
checkDeps(t)
repo, err := git.NewGoGitRepository(*testRepo, "1b12f15a11fc6e62177bef08f47bc7b5ce50b141", headSHA)
assert.NoError(t, err)
_, filePaths, err := runEachFileChange(repo)
assert.NoError(t, err)
assert.Equal(t, []string{"bar/branch-test.txt"}, filePaths)
}
func TestEachFileChangeGivenRangeOfTwoCommits(t *testing.T) {
checkDeps(t)
repo, err := git.NewGoGitRepository(*testRepo, "498214de67004b1da3d820901307bed2a68a8ef6", headSHA)
assert.NoError(t, err)
_, filePaths, err := runEachFileChange(repo)
assert.NoError(t, err)
assert.Equal(t, []string{}, filePaths)
}
package indexer
import (
"bytes"
"fmt"
"io/ioutil"
"gitlab.com/gitlab-org/es-git-go/git"
"gitlab.com/gitlab-org/es-git-go/linguist"
)
var (
SkipTooLargeBlob = fmt.Errorf("Blob should be skipped: Too large")
SkipBinaryBlob = fmt.Errorf("Blob should be skipped: binary")
)
const (
binarySearchLimit = 8 * 1024 // 8 KiB, Same as git
maxBlobSize = 1024 * 1024 // 1MiB, same as gitlab-elasticsearch-git
)
func isSkipBlobErr(err error) bool {
switch err {
case SkipTooLargeBlob:
return true
case SkipBinaryBlob:
return true
}
return false
}
type Blob struct {
Type string `json:"type"`
ID string `json:"-"`
OID string `json:"oid"`
RepoID string `json:"rid"`
CommitSHA string `json:"commit_sha"`
Content string `json:"content"`
Path string `json:"path"`
// Message copied from gitlab-elasticsearch-git:
//
// We're duplicating file_name parameter here because we need another
// analyzer for it.
//
// Ideally this should be done with copy_to: 'blob.file_name' option
// but it does not work in ES v2.3.*. We're doing it so to not make users
// install newest versions
//
// https://github.com/elastic/elasticsearch-mapper-attachments/issues/124
Filename string `json:"file_name"`
Language string `json:"language"`
}
func GenerateBlobID(parentID, filename string) string {
return fmt.Sprintf("%s_%s", parentID, filename)
}
func BuildBlob(file *git.File, parentID, commitSHA string) (*Blob, error) {
if file.Size > maxBlobSize {
return nil, SkipTooLargeBlob
}
reader, err := file.Blob()
if err != nil {
return nil, err
}
defer reader.Close()
// FIXME(nick): This doesn't look cheap. Check the RAM & CPU pressure, esp.
// for large blobs
b, err := ioutil.ReadAll(reader)
if err != nil {
return nil, err
}
if DetectBinary(b) {
return nil, SkipBinaryBlob
}
content := tryEncodeBytes(b)
filename := tryEncodeString(file.Path)
return &Blob{
Type: "blob",
ID: GenerateBlobID(parentID, filename),
OID: file.Oid,
RepoID: parentID,
CommitSHA: commitSHA,
Content: content,
Path: filename,
Filename: filename,
Language: DetectLanguage(filename, b),
}, nil
}
// DetectLanguage returns a string describing the language of the file. This is
// the programming language, rather than the natural language.
//
// If no language is detected, "Text" is returned.
func DetectLanguage(filename string, data []byte) string {
lang := linguist.DetectLanguage(filename, data)
if lang != nil {
return lang.Name
}
return "Text"
}
// DetectBinary checks whether the passed-in data contains a NUL byte. Only the
// start of large blobs is scanned. This is the same check git performs to
// distinguish text from binary
func DetectBinary(data []byte) bool {
searchLimit := binarySearchLimit
if len(data) < searchLimit {
searchLimit = len(data)
}
return bytes.Contains(data[:searchLimit], []byte{0})
}
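For reference, a tiny sketch of the two detection helpers in isolation; the expected values in the comments follow the blob tests below:

```
package main

import (
	"fmt"

	"gitlab.com/gitlab-org/es-git-go/indexer"
)

func main() {
	// DetectBinary looks for a NUL byte within the first 8 KiB.
	fmt.Println(indexer.DetectBinary([]byte("plain text"))) // false
	fmt.Println(indexer.DetectBinary([]byte("bin\x00ary"))) // true

	// DetectLanguage uses the filename and contents, as in the tests below.
	fmt.Println(indexer.DetectLanguage("Makefile.am", []byte("foo"))) // Makefile
	fmt.Println(indexer.DetectLanguage("foo.rb", []byte("foo")))      // Ruby
}
```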
package indexer_test
import (
"encoding/json"
"testing"
"github.com/stretchr/testify/assert"
"gitlab.com/gitlab-org/es-git-go/indexer"
)
func TestBuildBlob(t *testing.T) {
file := gitFile("foo/bar", "foo")
expected := validBlob(file, "foo", "Text")
actual, err := indexer.BuildBlob(file, expected.RepoID, expected.CommitSHA)
assert.NoError(t, err)
assert.Equal(t, expected, actual)
expectedJSON := `{
"commit_sha" : "` + expected.CommitSHA + `",
"content" : "` + expected.Content + `",
"file_name" : "` + expected.Filename + `",
"language" : "` + expected.Language + `",
"oid" : "` + expected.OID + `",
"path" : "` + expected.Path + `",
"rid" : "` + expected.RepoID + `",
"type" : "blob"
}`
actualJSON, err := json.Marshal(actual)
assert.NoError(t, err)
assert.JSONEq(t, expectedJSON, string(actualJSON))
}
func TestBuildBlobSkipsLargeBlobs(t *testing.T) {
file := gitFile("foo/bar", "foo")
file.Size = 1024*1024 + 1
blob, err := indexer.BuildBlob(file, parentID, sha)
assert.Equal(t, err, indexer.SkipTooLargeBlob)
assert.Nil(t, blob)
}
func TestBuildBlobSkipsBinaryBlobs(t *testing.T) {
file := gitFile("foo/bar", "foo\x00")
blob, err := indexer.BuildBlob(file, parentID, sha)
assert.Equal(t, err, indexer.SkipBinaryBlob)
assert.Nil(t, blob)
}
func TestBuildBlobDetectsLanguageByFilename(t *testing.T) {
file := gitFile("Makefile.am", "foo")
blob, err := indexer.BuildBlob(file, parentID, sha)
assert.NoError(t, err)
assert.Equal(t, "Makefile", blob.Language)
}
func TestBuildBlobDetectsLanguageByExtension(t *testing.T) {
file := gitFile("foo.rb", "foo")
blob, err := indexer.BuildBlob(file, parentID, sha)
assert.NoError(t, err)
assert.Equal(t, "Ruby", blob.Language)
}
func TestGenerateBlobID(t *testing.T) {
assert.Equal(t, "projectID_path", indexer.GenerateBlobID("projectID", "path"))
}
package indexer
import (
"fmt"
"gitlab.com/gitlab-org/es-git-go/git"
)
type Commit struct {
Type string `json:"type"`
ID string `json:"-"`
Author *Person `json:"author"`
Committer *Person `json:"committer"`
RepoID string `json:"rid"`
Message string `json:"message"`
SHA string `json:"sha"`
}
func GenerateCommitID(parentID, commitSHA string) string {
return fmt.Sprintf("%s_%s", parentID, commitSHA)
}
func BuildCommit(c *git.Commit, parentID string) *Commit {
sha := c.Hash
return &Commit{
Type: "commit",
Author: BuildPerson(c.Author),
Committer: BuildPerson(c.Committer),
ID: GenerateCommitID(parentID, sha),
RepoID: parentID,
Message: tryEncodeString(c.Message),
SHA: sha,
}
}
package indexer_test
import (
"encoding/json"
"testing"
"github.com/stretchr/testify/assert"
"gitlab.com/gitlab-org/es-git-go/indexer"
)
func TestBuildCommit(t *testing.T) {
gitCommit := gitCommit("Initial commit")
expected := validCommit(gitCommit)
actual := indexer.BuildCommit(gitCommit, expected.RepoID)
assert.Equal(t, expected, actual)
expectedJSON := `{
"sha" : "` + expected.SHA + `",
"message" : "` + expected.Message + `",
"author" : {
"name": "` + expected.Author.Name + `",
"email": "` + expected.Author.Email + `",
"time": "` + indexer.GenerateDate(gitCommit.Author.When) + `"
},
"committer" : {
"name": "` + expected.Committer.Name + `",
"email": "` + expected.Committer.Email + `",
"time": "` + indexer.GenerateDate(gitCommit.Committer.When) + `"
},
"rid" : "` + expected.RepoID + `",
"type" : "commit"
}`
actualJSON, err := json.Marshal(actual)
assert.NoError(t, err)
assert.JSONEq(t, expectedJSON, string(actualJSON))
}
func TestGenerateCommitID(t *testing.T) {
assert.Equal(t, "projectID_sha", indexer.GenerateCommitID("projectID", "sha"))
}
package indexer
import (
"fmt"
"log"
"github.com/lupine/icu"
)
var (
detector *icu.CharsetDetector
converter = icu.NewCharsetConverter(maxBlobSize)
)
func init() {
var err error
detector, err = icu.NewCharsetDetector()
if err != nil {
panic(err)
}
}
func tryEncodeString(s string) string {
encoded, err := encodeString(s)
if err != nil {
log.Println(err)
return s // TODO: Run it through the UTF-8 replacement encoder
}
return encoded
}
func tryEncodeBytes(b []byte) string {
encoded, err := encodeBytes(b)
if err != nil {
log.Println(err)
s := string(b)
return s // TODO: Run it through the UTF-8 replacement encoder
}
return encoded
}
func encodeString(s string) (string, error) {
return encodeBytes([]byte(s))
}
// encodeBytes converts a byte slice from an arbitrary encoding to UTF-8
func encodeBytes(b []byte) (string, error) {
if len(b) == 0 {
return "", nil
}
matches, err := detector.GuessCharset(b)
if err != nil {
return "", fmt.Errorf("Couldn't guess charset: %s", err)
}
if len(matches) == 0 {
return "", fmt.Errorf("Couldn't guess charset: no matches")
}
// Try encoding for each match, returning the first that succeeds
for _, match := range matches {
utf8, err := converter.ConvertToUtf8(b, match.Charset)
if err == nil {
return string(utf8), nil
}
}
return "", fmt.Errorf("Failed to convert from %s to UTF-8", matches[0].Charset)
}
package indexer
import (
"fmt"
"log"
"gitlab.com/gitlab-org/es-git-go/git"
)
type Submitter interface {
ParentID() string
Index(id string, thing interface{})
Remove(id string)
Flush() error
}
type Indexer struct {
git.Repository
Submitter
}
func (i *Indexer) SubmitCommit(c *git.Commit) error {
commit := BuildCommit(c, i.Submitter.ParentID())
i.Submitter.Index(commit.ID, map[string]interface{}{"commit": commit})
return nil
}
func (i *Indexer) SubmitBlob(f *git.File, _, toCommit string) error {
blob, err := BuildBlob(f, i.Submitter.ParentID(), toCommit)
if err != nil {
if isSkipBlobErr(err) {
return nil
}
return fmt.Errorf("Blob %s: %s", f.Path, err)
}
i.Submitter.Index(blob.ID, map[string]interface{}{"blob": blob})
return nil
}
func (i *Indexer) RemoveBlob(file *git.File, _, _ string) error {
blobID := GenerateBlobID(i.Submitter.ParentID(), file.Path)
i.Submitter.Remove(blobID)
return nil
}
func (i *Indexer) IndexCommits() error {
return i.Repository.EachCommit(i.SubmitCommit)
}
func (i *Indexer) IndexBlobs() error {
return i.Repository.EachFileChange(i.SubmitBlob, i.SubmitBlob, i.RemoveBlob)
}
func (i *Indexer) Index() error {
if err := i.IndexBlobs(); err != nil {
log.Print("Error while indexing blobs: ", err)
return err
}
if err := i.IndexCommits(); err != nil {
log.Print("Error while indexing commits: ", err)
return err
}
if err := i.Submitter.Flush(); err != nil {
log.Print("Error while flushing requests: ", err)
return err
}
return nil
}
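Indexer simply composes a git.Repository with a Submitter, and elastic.Client satisfies the Submitter interface. A sketch of wiring the real implementations together, roughly what the `es-git-go` binary exercised by the integration tests would do; the FROM_SHA/TO_SHA environment handling is an assumption taken from those tests, and the project ID and repository path are placeholders:

```
package main

import (
	"log"
	"os"

	"gitlab.com/gitlab-org/es-git-go/elastic"
	"gitlab.com/gitlab-org/es-git-go/git"
	"gitlab.com/gitlab-org/es-git-go/indexer"
)

func main() {
	// Illustrative arguments: a project ID and a repository path, with the
	// commit range taken from FROM_SHA/TO_SHA as in the integration tests.
	projectID, repoPath := "667", "/path/to/repo.git"

	repo, err := git.NewGoGitRepository(repoPath, os.Getenv("FROM_SHA"), os.Getenv("TO_SHA"))
	if err != nil {
		log.Fatal(err)
	}

	client, err := elastic.FromEnv(projectID)
	if err != nil {
		log.Fatal(err)
	}
	defer client.Close()

	idx := &indexer.Indexer{Repository: repo, Submitter: client}
	if err := idx.Index(); err != nil {
		log.Fatal(err)
	}
}
```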
package indexer_test
import (
"fmt"
"io"
"io/ioutil"
"strings"
"testing"
"time"
"github.com/stretchr/testify/assert"
"gitlab.com/gitlab-org/es-git-go/git"
"gitlab.com/gitlab-org/es-git-go/indexer"
)
const (
sha = "9876543210987654321098765432109876543210"
oid = "0123456789012345678901234567890123456789"
parentID = "667"
)
type fakeSubmitter struct {
flushed int
indexed int
indexedID []string
indexedThing []interface{}
removed int
removedID []string
}
type fakeRepository struct {
commits []*git.Commit
added []*git.File
modified []*git.File
removed []*git.File
}
func (f *fakeSubmitter) ParentID() string {
return parentID
}
func (f *fakeSubmitter) Index(id string, thing interface{}) {
f.indexed++
f.indexedID = append(f.indexedID, id)
f.indexedThing = append(f.indexedThing, thing)
}
func (f *fakeSubmitter) Remove(id string) {
f.removed++
f.removedID = append(f.removedID, id)
}
func (f *fakeSubmitter) Flush() error {
f.flushed++
return nil
}
func (r *fakeRepository) EachFileChange(ins, mod, del git.FileFunc) error {
for _, file := range r.added {
if err := ins(file, sha, sha); err != nil {
return err
}
}
for _, file := range r.modified {
if err := mod(file, sha, sha); err != nil {
return err
}
}
for _, file := range r.removed {
if err := del(file, sha, sha); err != nil {
return err
}
}
return nil
}
func (r *fakeRepository) EachCommit(f git.CommitFunc) error {
for _, commit := range r.commits {
if err := f(commit); err != nil {
return err
}
}
return nil
}
func setupIndexer() (*indexer.Indexer, *fakeRepository, *fakeSubmitter) {
repo := &fakeRepository{}
submitter := &fakeSubmitter{}
return &indexer.Indexer{
Repository: repo,
Submitter: submitter,
}, repo, submitter
}
func readerFunc(data string, err error) func() (io.ReadCloser, error) {
return func() (io.ReadCloser, error) {
return ioutil.NopCloser(strings.NewReader(data)), err
}
}
func gitFile(path, content string) *git.File {
return &git.File{
Path: path,
Blob: readerFunc(content, nil),
Size: int64(len(content)),
Oid: oid,
}
}
func gitCommit(message string) *git.Commit {
return &git.Commit{
Author: git.Signature{
Email: "job@gitlab.com",
Name: "Job van der Voort",
When: time.Date(2016, time.September, 27, 14, 37, 46, 0, time.UTC),
},
Committer: git.Signature{
Email: "nick@gitlab.com",
Name: "Nick Thomas",
When: time.Date(2017, time.October, 28, 15, 38, 47, 1, time.UTC),
},
Message: message,
Hash: sha,
}
}
func validBlob(file *git.File, content, language string) *indexer.Blob {
return &indexer.Blob{
Type: "blob",
ID: indexer.GenerateBlobID(parentID, file.Path),
OID: oid,
RepoID: parentID,
CommitSHA: sha,
Content: content,
Path: file.Path,
Filename: file.Path,
Language: language,
}
}
func validCommit(gitCommit *git.Commit) *indexer.Commit {
return &indexer.Commit{
Type: "commit",
ID: indexer.GenerateCommitID(parentID, gitCommit.Hash),
Author: indexer.BuildPerson(gitCommit.Author),
Committer: indexer.BuildPerson(gitCommit.Committer),
RepoID: parentID,
Message: gitCommit.Message,
SHA: sha,
}
}
func TestIndex(t *testing.T) {
idx, repo, submit := setupIndexer()
gitCommit := gitCommit("Initial commit")
gitAdded := gitFile("foo/bar", "added file")
gitModified := gitFile("foo/baz", "modified file")
gitRemoved := gitFile("foo/qux", "removed file")
gitTooBig := gitFile("invalid/too-big", "")
gitTooBig.Size = int64(1024*1024 + 1)
gitBinary := gitFile("invalid/binary", "foo\x00")
commit := validCommit(gitCommit)
added := validBlob(gitAdded, "added file", "Text")
modified := validBlob(gitModified, "modified file", "Text")
removed := validBlob(gitRemoved, "removed file", "Text")
repo.commits = append(repo.commits, gitCommit)
repo.added = append(repo.added, gitAdded, gitTooBig, gitBinary)
repo.modified = append(repo.modified, gitModified)
repo.removed = append(repo.removed, gitRemoved)
assert.NoError(t, idx.Index())
assert.Equal(t, submit.indexed, 3)
assert.Equal(t, submit.removed, 1)
assert.Equal(t, parentID+"_"+added.Path, submit.indexedID[0])
assert.Equal(t, map[string]interface{}{"blob": added}, submit.indexedThing[0])
assert.Equal(t, parentID+"_"+modified.Path, submit.indexedID[1])
assert.Equal(t, map[string]interface{}{"blob": modified}, submit.indexedThing[1])
assert.Equal(t, parentID+"_"+commit.SHA, submit.indexedID[2])
assert.Equal(t, map[string]interface{}{"commit": commit}, submit.indexedThing[2])
assert.Equal(t, parentID+"_"+removed.Path, submit.removedID[0])
assert.Equal(t, submit.flushed, 1)
}
func TestErrorIndexingSkipsRemainder(t *testing.T) {
idx, repo, submit := setupIndexer()
gitOKFile := gitFile("ok", "")
gitBreakingFile := gitFile("broken", "")
gitBreakingFile.Blob = readerFunc("", fmt.Errorf("Error"))
repo.added = append(repo.added, gitBreakingFile, gitOKFile)
err := idx.Index()
assert.Error(t, err)
assert.Equal(t, submit.indexed, 0)
assert.Equal(t, submit.removed, 0)
assert.Equal(t, submit.flushed, 0)
}
package indexer
import (
"time"
"gitlab.com/gitlab-org/es-git-go/git"
)
const (
elasticTimeFormat = "20060102T150405-0700"
)
type Person struct {
Name string `json:"name"`
Email string `json:"email"`
Time string `json:"time"` // %Y%m%dT%H%M%S%z
}
func GenerateDate(t time.Time) string {
return t.Format(elasticTimeFormat)
}
func BuildPerson(p git.Signature) *Person {
return &Person{
Name: tryEncodeString(p.Name),
Email: tryEncodeString(p.Email),
Time: GenerateDate(p.When),
}
}
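elasticTimeFormat is the Go layout equivalent of strftime's %Y%m%dT%H%M%S%z; the integration test below expects 2016-09-27 14:37:46 UTC to render as 20160927T143746+0000. A quick sketch:

```
package main

import (
	"fmt"
	"time"

	"gitlab.com/gitlab-org/es-git-go/indexer"
)

func main() {
	when := time.Date(2016, time.September, 27, 14, 37, 46, 0, time.UTC)
	fmt.Println(indexer.GenerateDate(when)) // 20160927T143746+0000
}
```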
package main_test
import (
"encoding/json"
"flag"
"fmt"
"os"
"os/exec"
"testing"
"time"
"github.com/stretchr/testify/assert"
"gitlab.com/gitlab-org/es-git-go/elastic"
"gitlab.com/gitlab-org/es-git-go/indexer"
)
var (
binary = flag.String("binary", "./bin/es-git-go", "Path to `es-git-go` binary for integration tests")
testRepo = flag.String("test-repo", "./tmp/gitlab-test.git", "Path to `gitlab-test` repository for integration tests")
)
const (
projectID = "667"
headSHA = "b83d6e391c22777fca1ed3012fce84f633d7fed0"
)
func checkDeps(t *testing.T) {
if os.Getenv("ELASTIC_CONNECTION_INFO") == "" {
t.Log("ELASTIC_CONNECTION_INFO not set")
t.Skip()
}
if testing.Short() {
t.Log("Test run with -short, skipping integration test")
t.Skip()
}
if _, err := os.Stat(*binary); err != nil {
t.Log("No binary found at ", *binary)
t.Skip()
}
if _, err := os.Stat(*testRepo); err != nil {
t.Log("No test repo found at ", *testRepo)
t.Skip()
}
}
func buildIndex(t *testing.T) (*elastic.Client, func()) {
railsEnv := fmt.Sprintf("test-integration-%d", time.Now().Unix())
os.Setenv("RAILS_ENV", railsEnv)
client, err := elastic.FromEnv(projectID)
assert.NoError(t, err)
assert.NoError(t, client.CreateIndex())
return client, func() {
client.DeleteIndex()
}
}
func run(from, to string) error {
cmd := exec.Command(*binary, projectID, *testRepo)
cmd.Env = os.Environ()
cmd.Stdout = os.Stdout
cmd.Stderr = os.Stderr
if from != "" {
cmd.Env = append(cmd.Env, "FROM_SHA="+from)
}
if to != "" {
cmd.Env = append(cmd.Env, "TO_SHA="+to)
}
return cmd.Run()
}
func TestIndexingRemovesFiles(t *testing.T) {
checkDeps(t)
c, td := buildIndex(t)
defer td()
// This range ends just before files/empty is removed, so the blob should be indexed
assert.NoError(t, run("", "19e2e9b4ef76b422ce1154af39a91323ccc57434"))
_, err := c.GetBlob("files/empty")
assert.NoError(t, err)
// Now we expect it to have been removed
assert.NoError(t, run("19e2e9b4ef76b422ce1154af39a91323ccc57434", "08f22f255f082689c0d7d39d19205085311542bc"))
_, err = c.GetBlob("files/empty")
assert.Error(t, err)
}
// Go source is defined to be UTF-8 encoded, so literals here are UTF-8
func TestIndexingTranscodesToUTF8(t *testing.T) {
checkDeps(t)
c, td := buildIndex(t)
defer td()
assert.NoError(t, run("", headSHA))
for _, tc := range []struct{
path string
expected string
} {
{"encoding/iso8859.txt", "狞\n"}, // GB18030
{"encoding/test.txt", "これはテストです。\nこれもマージして下さい。\n\nAdd excel file.\nDelete excel file."}, // SHIFT_JIS
} {
blob, err := c.GetBlob(tc.path)
assert.NoError(t, err)
blobDoc := make(map[string]*indexer.Blob)
assert.NoError(t, json.Unmarshal(*blob.Source, &blobDoc))
assert.Equal(t, tc.expected, blobDoc["blob"].Content)
}
}
func TestIndexingGitlabTest(t *testing.T) {
checkDeps(t)
c, td := buildIndex(t)
defer td()
assert.NoError(t, run("", headSHA))
// Check the indexing of a commit
commit, err := c.GetCommit(headSHA)
assert.NoError(t, err)
assert.True(t, commit.Found)
assert.Equal(t, "repository", commit.Type)
assert.Equal(t, projectID+"_"+headSHA, commit.Id)
assert.Equal(t, projectID, commit.Routing)
assert.Equal(t, projectID, commit.Parent)
doc := make(map[string]map[string]interface{})
assert.NoError(t, json.Unmarshal(*commit.Source, &doc))
commitDoc, ok := doc["commit"]
assert.True(t, ok)
assert.Equal(
t,
map[string]interface{}{
"type": "commit",
"sha": headSHA,
"author": map[string]interface{}{
"email": "job@gitlab.com",
"name": "Job van der Voort",
"time": "20160927T143746+0000",
},
"committer": map[string]interface{}{
"email": "job@gitlab.com",
"name": "Job van der Voort",
"time": "20160927T143746+0000",
},
"rid": projectID,
"message": "Merge branch 'branch-merged' into 'master'\r\n\r\nadds bar folder and branch-test text file to check Repository merged_to_root_ref method\r\n\r\n\r\n\r\nSee merge request !12",
},
commitDoc,
)
// Check the indexing of a text blob
blob, err := c.GetBlob("README.md")
assert.NoError(t, err)
assert.True(t, blob.Found)
assert.Equal(t, "repository", blob.Type)
assert.Equal(t, projectID+"_README.md", blob.Id)
assert.Equal(t, projectID, blob.Routing)
assert.Equal(t, projectID, blob.Parent)
doc = make(map[string]map[string]interface{})
assert.NoError(t, json.Unmarshal(*blob.Source, &doc))
blobDoc, ok := doc["blob"]
assert.True(t, ok)
assert.Equal(
t,
map[string]interface{}{
"type": "blob",
"language": "Markdown",
"path": "README.md",
"file_name": "README.md",
"oid": "faaf198af3a36dbf41961466703cc1d47c61d051",
"rid": projectID,
"commit_sha": headSHA,
"content": "testme\n======\n\nSample repo for testing gitlab features\n",
},
blobDoc,
)
// Check that a binary blob isn't indexed
_, err = c.GetBlob("Gemfile.zip")
assert.Error(t, err)
// Test that timezones are preserved
commit, err = c.GetCommit("498214de67004b1da3d820901307bed2a68a8ef6")
assert.NoError(t, err)
cDoc := make(map[string]*indexer.Commit)
assert.NoError(t, json.Unmarshal(*commit.Source, &cDoc))
assert.Equal(t, "20160921T161326+0100", cDoc["commit"].Author.Time)
assert.Equal(t, "20160921T161326+0100", cDoc["commit"].Committer.Time)
}