Skip to content
Snippets Groups Projects
Commit e57f8fe3 authored by Nick Thomas's avatar Nick Thomas
Browse files

Initial support for language detection

parent ec2d0762
No related branches found
No related tags found
1 merge request!1Initial implementation of an elasticsearch indexer in Go
Pipeline #
Loading
Loading
@@ -6,6 +6,8 @@ import (
"io/ioutil"
 
"srcd.works/go-git.v4/plumbing/object"
"gitlab.com/gitlab-org/es-git-go/linguist"
)
 
var (
Loading
Loading
@@ -93,17 +95,26 @@ func (i *Indexer) BuildBlob(file *object.File, commitSHA string) (*Blob, error)
Content: content,
Path: file.Name,
Filename: file.Name,
Language: DetectLanguage(b),
Language: DetectLanguage(file.Name, b),
}, nil
}
 
// FIXME: implement this
func DetectLanguage(data []byte) string {
// DetectLanguage names the programming language (not the natural language) of
// the file called filename whose contents are data, as determined by the
// linguist package.
//
// If no language is detected, "Text" is returned.
func DetectLanguage(filename string, data []byte) string {
	if detected := linguist.DetectLanguage(filename, data); detected != nil {
		return detected.Name
	}

	return "Text"
}
 
// Check whether the passed-in data contains a NUL byte. Only scan the start of
// large blobs. This is the same test performed by git to check text/binary
// DetectBinary checks whether the passed-in data contains a NUL byte. Only scan
// the start of large blobs. This is the same test performed by git to check
// text/binary
func DetectBinary(data []byte) bool {
searchLimit := binarySearchLimit
if len(data) < searchLimit {
Loading
Loading
# Linguist
The Ruby [github-linguist](https://github.com/github/linguist) Gem is a useful
collection of helpers for language detection. This directory contains
reimplementations of parts of the functionality of that gem.
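The file `languages.go` is programmatically generated by
`generate_languages.go.rb` (run via `go generate`) from
[this file](https://github.com/github/linguist/blob/master/lib/linguist/languages.yml);
the generator fetches the copy of that file tagged v4.7.6.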
The following license applies:
Copyright (c) 2017 GitHub, Inc.
Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation
files (the "Software"), to deal in the Software without
restriction, including without limitation the rights to use,
copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following
conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.
require 'net/http'
require 'yaml'
# Renders a Ruby array as the source text of a Go `[]string{...}` literal.
# Every element is quoted with #inspect and the elements are comma-separated.
def array(ary)
  quoted = ary.map { |item| item.inspect }
  "[]string{" + quoted.join(", ") + "}"
end
# Renders a YAML value as the Go boolean literal "true" or "false".
# A nil value (key absent) is replaced by default; anything else is judged
# by Ruby truthiness.
def bool(value, default)
  effective = value.nil? ? default : value
  effective ? "true" : "false"
end
# Renders one entry of the Go `Languages` map literal.
#
# name    - the language name (the key of the entry in languages.yml).
# details - the attribute Hash for that language from languages.yml.
#
# Optional attributes are emitted only when present in details. Wrap and
# Searchable are always emitted, defaulting to false and true respectively.
# Field names are padded so that the values line up in one column.
#
# Returns the entry as a String terminated by a newline.
def build_language(name, details)
  out = []
  out << ["Name", name.inspect]
  out << ["Type", details['type'].inspect] if details['type']
  out << ["Group", details['group'].inspect] if details['group']
  out << ["Color", details['color'].inspect] if details['color']
  out << ["Aliases", array(details['aliases'])] if details['aliases']
  out << ["Extensions", array(details['extensions'])] if details['extensions']
  out << ["Filenames", array(details['filenames'])] if details['filenames']
  out << ["Interpreters", array(details['interpreters'])] if details['interpreters']
  out << ["TmScope", details['tm_scope'].inspect] if details['tm_scope']
  # AceMode must carry the ace_mode value itself, not the language type.
  out << ["AceMode", details['ace_mode'].inspect] if details['ace_mode']
  # NOTE(review): entries carrying language_id produce a LanguageID field;
  # confirm the Go Language struct declares it before relying on this.
  out << ["LanguageID", details['language_id']] if details['language_id']

  # Two strange booleans
  out << ["Wrap", bool(details['wrap'], false)]
  out << ["Searchable", bool(details['searchable'], true)]

  max_key = out.map { |k, _v| k.size }.max
  out = out.map do |k, v|
    "\t\t\t#{k}:#{" " * (max_key - k.size)} #{v},"
  end

  "\t\t#{name.inspect}: &Language{\n#{out.join("\n")}\n\t\t},\n"
end
# Upstream language table, pinned to a linguist release tag.
LANGUAGES_YML = URI.parse("https://raw.githubusercontent.com/github/linguist/v4.7.6/lib/linguist/languages.yml")

# Fetch first and verify the response, so that a failed download can never
# truncate an existing languages.go or be parsed as if it were the table.
response = Net::HTTP.get_response(LANGUAGES_YML)
unless response.is_a?(Net::HTTPSuccess)
  abort "Failed to fetch #{LANGUAGES_YML}: #{response.code} #{response.message}"
end

# NOTE(review): YAML.load can instantiate arbitrary objects from the document.
# The source is a pinned upstream file, but consider YAML.safe_load.
languages = YAML.load(response.body)

# The block form closes the file even if rendering an entry raises.
File.open("languages.go", "w") do |f|
  f.puts "package linguist"
  f.puts ""
  f.puts "var ("
  f.puts "\tLanguages = map[string]*Language{"
  languages.each { |name, details| f.puts build_language(name, details) }
  f.puts "\t}"
  f.puts ")"
end
package linguist
import (
	"path"
	"sort"
)
// There's no YAML support in the Go stdlib, so use ruby instead.
// This will create a file `languages.go`, containing the Languages variable
//go:generate ruby generate_languages.go.rb

// Language describes one entry of the generated Languages table. The fields
// mirror the per-language keys of linguist's languages.yml as written out by
// generate_languages.go.rb; a key absent from the YAML leaves the field at its
// zero value, except for Wrap and Searchable (see below).
type Language struct {
// Name is the key of the entry in languages.yml.
Name string
Type string
Group string
Color string
Aliases []string
// Extensions and Filenames are the keys of the lookup tables built in init.
Extensions []string
Filenames []string
Interpreters []string
TmScope string
// NOTE(review): confirm the generated value really is the ace_mode string.
AceMode string
// Wrap is generated as false when the YAML entry has no wrap key.
Wrap bool
// Searchable is generated as true when the YAML entry has no searchable key.
Searchable bool
}
// Lookup tables over Languages, populated once by init.
var (
// languagesByExtension maps each string listed in a Language's Extensions to
// every language that lists it.
languagesByExtension map[string][]*Language
// languagesByFilename maps each string listed in a Language's Filenames to
// every language that lists it.
languagesByFilename map[string][]*Language
)
func init() {
languagesByExtension = make(map[string][]*Language)
for _, lang := range Languages {
for _, ext := range lang.Extensions {
languagesByExtension[ext] = append(languagesByExtension[ext], lang)
}
}
languagesByFilename = make(map[string][]*Language)
for _, lang := range Languages {
for _, filename := range lang.Filenames {
languagesByFilename[filename] = append(languagesByFilename[filename], lang)
}
}
}
// and returns the languages present in both a and b, in the order in which
// they occur in a. Languages are compared by pointer identity. The result is
// nil when nothing is shared.
func and(a, b []*Language) []*Language {
	var common []*Language

	for i := range a {
		for j := range b {
			if a[i] != b[j] {
				continue
			}
			common = append(common, a[i])
		}
	}

	return common
}
// DetectLanguageByFilename returns every language that lists the base name of
// filename (its last slash-separated element) among its Filenames. The result
// is nil when no language claims that name.
func DetectLanguageByFilename(filename string) []*Language {
	// Look up the filename table; the extension table is keyed by extensions
	// such as ".go" and never matches a whole base name like "Makefile".
	return languagesByFilename[path.Base(filename)]
}
// DetectLanguageByExtension returns every language that lists the extension of
// filename, as computed by path.Ext, among its Extensions. The result is nil
// when no language claims that extension.
func DetectLanguageByExtension(filename string) []*Language {
	ext := path.Ext(filename)

	return languagesByExtension[ext]
}
// DetectLanguage picks a single language for the file called filename, or
// returns nil when none can be determined. blob is accepted for content-based
// strategies but is not inspected yet.
//
// The filename is consulted first: a unique filename match wins outright.
// Otherwise the extension candidates are used, narrowed to those that also
// matched the filename when the filename was ambiguous. If that narrowing
// leaves nothing, the first filename candidate is returned, so a filename
// match is never discarded.
func DetectLanguage(filename string, blob []byte) *Language {
	// TODO: github-linguist uses a range of strategies not replicated here.
	// It does the following:
	//
	//   * modelines
	//   * shebangs
	//   * filename / extension (we have these)
	//   * heuristics
	//   * classifier
	byFilename := DetectLanguageByFilename(filename)
	if len(byFilename) == 1 {
		return byFilename[0]
	}

	candidates := DetectLanguageByExtension(filename)
	if len(byFilename) > 1 {
		candidates = and(byFilename, candidates)
		if len(candidates) == 0 {
			// The extension could not narrow the filename matches; keep them.
			candidates = byFilename
		}
	}

	if len(candidates) > 0 {
		return candidates[0]
	}

	return nil
}
package linguist_test
import (
"testing"
"github.com/stretchr/testify/assert"
"gitlab.com/gitlab-org/es-git-go/linguist"
)
// TestCommonLanguagesAreDetectedByExtension checks, for a set of well-known
// extensions and path shapes, that the extension lookup yields exactly one
// language and that DetectLanguage agrees with it.
func TestCommonLanguagesAreDetectedByExtension(t *testing.T) {
	for _, tc := range []struct {
		file string
		lang string
	}{
		{"foo.go", "Go"},
		{".go", "Go"},
		{"foo.go.rb", "Ruby"},
		{"foo.rb", "Ruby"},
		{"foo.c", "C"},
		{"foo.cpp", "C++"},
		{"/bar/foo.ini", "INI"},
		{"bar/foo.ini", "INI"},
		{"c:/foo.ini", "INI"},
		{`c:\foo.ini`, "INI"},
		{"foo.md", "Markdown"}, // Multiple possible languages
	} {
		langs := linguist.DetectLanguageByExtension(tc.file)
		// Only index the slice once its length is confirmed, so that a failed
		// lookup is reported as a test failure rather than a panic.
		if assert.Equal(t, 1, len(langs), tc.file) {
			assert.Equal(t, tc.lang, langs[0].Name, tc.file)
		}

		lang := linguist.DetectLanguage(tc.file, []byte{})
		if assert.NotNil(t, lang, tc.file) {
			assert.Equal(t, tc.lang, lang.Name, tc.file)
		}
	}
}
func TestImaginaryLanguageIsntRecognised(t *testing.T) {
lang := linguist.DetectLanguageByFilename("foo.absolutely-nobody-will-make-this-extension")
assert.Nil(t, lang)
}
// TestAttributesAreCopiedCorrectly spot-checks languages.go, the output of the
// go:generate script, by comparing one known value per Language field against
// the generated table.
func TestAttributesAreCopiedCorrectly(t *testing.T) {
	// lookup fetches a language by name and records a failure if it is absent.
	lookup := func(name string) *linguist.Language {
		found := linguist.Languages[name]
		assert.NotNil(t, found)
		return found
	}

	ada := lookup("Ada")
	cmake := lookup("CMake")
	gettext := lookup("Gettext Catalog")
	golang := lookup("Go")
	json := lookup("JSON")
	markdown := lookup("Markdown")
	ruby := lookup("Ruby")

	// Scalar string fields.
	assert.Equal(t, "Go", golang.Name)
	assert.Equal(t, "programming", golang.Type)
	assert.Equal(t, "JavaScript", json.Group)
	assert.Equal(t, "#375eab", golang.Color)

	// List fields.
	assert.Equal(t, []string{"ada95", "ada2005"}, ada.Aliases)
	assert.Equal(t, []string{".go"}, golang.Extensions)
	assert.Equal(t, []string{"CMakeLists.txt"}, cmake.Filenames)
	assert.Equal(t, []string{"ruby", "macruby", "rake", "jruby", "rbx"}, ruby.Interpreters)

	assert.Equal(t, "source.gfm", markdown.TmScope)
	// NOTE(review): "programming" is also golang.Type; confirm this is the
	// intended ace_mode value and not an artefact of how languages.go was
	// generated.
	assert.Equal(t, "programming", golang.AceMode)

	// Booleans: one entry per non-default value.
	assert.False(t, gettext.Searchable)
	assert.True(t, markdown.Wrap)
}
This diff is collapsed.
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment