Commit d3a8548ed2a2e663dd82afe9dd69629cbddca7cb

Authored by James McMinn
1 parent a307d82a1b
Exists in master

Been a while apparently...

Showing 20 changed files with 197 additions and 22 deletions Side-by-side Diff

filter/filter.go 100644 → 100755
File mode changed
filter/lengthfilter.go 100644 → 100755
File mode changed
filter/nonalphanumericfilter.go 100644 → 100755
File mode changed
filter/stopwordfilter.go
1 1 package filter
2 2  
3 3 import (
  4 + "git.scoopanalytics.io/core/ir-tools"
4 5 "log"
5   - "mirgit.dcs.gla.ac.uk/JamesMcMinn/ir-tools"
6 6 )
7 7  
8 8 type StopWordFilter struct {
9 9 stopwords map[string]bool
10 10 }
11 11  
12   -func NewStopWordFilter(stopwordFile string) *StopWordFilter {
13   - terms, err := irtools.ReadLines(stopwordFile)
14   -
15   - if err != nil {
16   - log.Fatal(err)
17   - }
  12 +func NewStopWordFilter(stopwordFiles ...string) *StopWordFilter {
18 13 stopwords := map[string]bool{}
19 14  
20   - for _, term := range terms {
21   - stopwords[term] = true
  15 + for _, filename := range stopwordFiles {
  16 + terms, err := irtools.ReadLines(filename)
  17 +
  18 + if err != nil {
  19 + log.Fatal(err)
  20 + }
  21 +
  22 + for _, term := range terms {
  23 + stopwords[term] = true
  24 + }
22 25 }
23 26  
24 27 return &StopWordFilter{stopwords}
... ... @@ -34,3 +37,8 @@ func (filter *StopWordFilter) Filter(input []string) (output []string) {
34 37 }
35 38 return output
36 39 }
  40 +
  41 +func (filter *StopWordFilter) IsStopWord(test string) bool {
  42 + _, found := filter.stopwords[test]
  43 + return found
  44 +}
filter/urlfilter.go 100644 → 100755
File mode changed
formats/csv.go 100644 → 100755
File mode changed
index/document.go 100644 → 100755
File mode changed
... ... @@ -10,6 +10,7 @@ type Sortable interface {
10 10 }
11 11  
12 12 type Integer64 int64
  13 +type Integer32 int32
13 14 type Integer int
14 15 type Float64 float64
15 16  
... ... @@ -26,6 +27,19 @@ func (v Integer64) Compare(to interface{}) int {
26 27 func (v Integer64) GT(to interface{}) bool { return v > to.(Integer64) }
27 28 func (v Integer64) LT(to interface{}) bool { return v < to.(Integer64) }
28 29  
  30 +func (v Integer32) Compare(to interface{}) int {
  31 + t := to.(Integer32)
  32 + if v.GT(t) {
  33 + return 1
  34 + }
  35 + if v.LT(t) {
  36 + return -1
  37 + }
  38 + return 0
  39 +}
  40 +func (v Integer32) GT(to interface{}) bool { return v > to.(Integer32) }
  41 +func (v Integer32) LT(to interface{}) bool { return v < to.(Integer32) }
  42 +
29 43 func (v Integer) Compare(to interface{}) int {
30 44 t := to.(Integer)
31 45 if v.GT(t) {
... ... @@ -90,27 +104,36 @@ func (m *SortedMap) Contains(item interface{}) bool {
90 104 }
91 105  
92 106 func (m *SortedMap) fixOrder(position int) {
93   - currentPos := position
94 107  
95   - if prevPos := currentPos - 1; prevPos >= 0 {
  108 + prevPos := position
  109 + for prevPos >= 1 {
  110 + currentPos := prevPos
  111 + prevPos = currentPos - 1
  112 +
96 113 curr := m.order[currentPos]
97 114 prev := m.order[prevPos]
98 115 if m.data[curr].LT(m.data[prev]) {
99 116 m.order[currentPos], m.order[prevPos] = m.order[prevPos], m.order[currentPos]
100 117 m.ord[m.order[currentPos]] = currentPos
101 118 m.ord[m.order[prevPos]] = prevPos
102   - m.fixOrder(prevPos)
  119 + } else {
  120 + break
103 121 }
104 122 }
105 123  
106   - if nextPos := currentPos + 1; nextPos < len(m.ord) {
  124 + nextPos := position
  125 + for (nextPos + 1) < len(m.ord) {
  126 + currentPos := nextPos
  127 + nextPos := currentPos + 1
  128 +
107 129 curr := m.order[currentPos]
108 130 next := m.order[nextPos]
109 131 if m.data[curr].GT(m.data[next]) {
110 132 m.order[currentPos], m.order[nextPos] = m.order[nextPos], m.order[currentPos]
111 133 m.ord[m.order[currentPos]] = currentPos
112 134 m.ord[m.order[nextPos]] = nextPos
113   - m.fixOrder(nextPos)
  135 + } else {
  136 + break
114 137 }
115 138 }
116 139 }
parse/parser_test.go
... ... @@ -0,0 +1,79 @@
  1 +package parse
  2 +
  3 +import (
  4 + "strings"
  5 + "testing"
  6 +)
  7 +
  8 +// Perform deep parsing of a sentence
  9 +func parseSentence(sentence string) (parsedSentence string, err error) {
  10 + // Strip trailing hashtags
  11 + for {
  12 + sentence = strings.TrimRight(sentence, " #")
  13 + lastSpace := strings.LastIndex(sentence, " ")
  14 + lastHash := strings.LastIndex(sentence, "#")
  15 + if lastSpace > 0 && lastSpace < lastHash {
  16 + sentence = sentence[:lastHash]
  17 + } else {
  18 + break
  19 + }
  20 + }
  21 +
  22 + // Strip trailing mentions
  23 + for {
  24 + lastSpace := strings.LastIndex(sentence, " ")
  25 + lastMention := strings.LastIndex(sentence, "@")
  26 + if lastSpace > 0 && lastSpace < lastMention {
  27 + sentence = sentence[:lastMention]
  28 + } else {
  29 + break
  30 + }
  31 + }
  32 +
  33 + // Strip trailing sub-sections (i.e. this)
  34 + for {
  35 + lastSpace := strings.LastIndex(sentence, " ")
  36 + lastParenthesis := strings.LastIndex(sentence, "(")
  37 + if lastSpace > 0 && lastSpace < lastParenthesis {
  38 + sentence = sentence[:lastParenthesis]
  39 + } else {
  40 + break
  41 + }
  42 + }
  43 +
  44 + // Strip leading hashtags
  45 + firstSpace := strings.IndexAny(sentence, " ")
  46 + firstHash := strings.IndexAny(sentence, "#")
  47 + if firstHash >= 0 && firstHash < firstSpace {
  48 + sentence = sentence[firstSpace+1:]
  49 + }
  50 +
  51 + // Strip "via" text (e.g. Some Tech News via the Verge)
  52 + via := strings.Index(sentence, " via ")
  53 + if via > 0 {
  54 + sentence = sentence[:via+1]
  55 + }
  56 +
  57 + // Strip common prefixes
  58 + if strings.HasPrefix(sentence, "BBC News UK") {
  59 + sentence = sentence[len("BBC News UK"):]
  60 + } else if strings.HasPrefix(sentence, "US News") {
  61 + sentence = sentence[len("US News"):]
  62 + } else if strings.HasPrefix(sentence, "World News") {
  63 + sentence = sentence[len("World News"):]
  64 + }
  65 +
  66 + sentence = strings.TrimSpace(sentence)
  67 +
  68 + return sentence, nil
  69 +}
  70 +
  71 +func TestSentenceTokenizer(t *testing.T) {
  72 + sentence, _ := parseSentence("Man U Team v Crystal Palace: De Gea, Valencia, McNair, Blind, Shaw, Fellaini, Carrick, Januzaj, Rooney, Di Maria, van Persie#test")
  73 +
  74 + st := NewSentenceTokenizer(sentence)
  75 + longest, _ := st.Longest()
  76 + if longest != "De Gea, Valencia, McNair, Blind, Shaw, Fellaini, Carrick, Januzaj, Rooney, Di Maria, van Persie" {
  77 + t.Error("Expected " + sentence + " got " + longest)
  78 + }
  79 +}
parse/sentencetokenizer.go
... ... @@ -104,7 +104,7 @@ func (t *SentenceTokenizer) isDelimiter(pos int) (bool, bool) {
104 104 }
105 105  
106 106 // Part of an Acronym?
107   - if ((pos+3) <= len(t.runes) && t.runes[pos+2] == '.' && t.runes[pos+1] != '.') || (pos > 2 && t.runes[pos-2] == '.' && t.runes[pos-1] != '.') {
  107 + if ((pos+3) <= len(t.runes) && (t.runes[pos+2] == '.' || t.runes[pos+2] == ' ' || t.runes[pos+2] == ',') && t.runes[pos+1] != '.') || (pos > 2 && t.runes[pos-2] == '.' && t.runes[pos-1] != '.') {
108 108 return false, false
109 109 }
110 110  
... ... @@ -113,12 +113,24 @@ func (t *SentenceTokenizer) isDelimiter(pos int) (bool, bool) {
113 113 return false, false
114 114 }
115 115  
  116 + if (pos >= 2 && t.runes[pos-2] == ' ') || pos == 1 {
  117 + abrv := strings.ToLower(string(t.runes[pos-1 : pos]))
  118 + // name?
  119 + if abrv == "a" || abrv == "b" || abrv == "c" || abrv == "d" || abrv == "e" || abrv == "f" ||
  120 + abrv == "g" || abrv == "h" || abrv == "i" || abrv == "j" || abrv == "k" || abrv == "l" ||
  121 + abrv == "m" || abrv == "n" || abrv == "o" || abrv == "p" || abrv == "q" || abrv == "r" ||
  122 + abrv == "s" || abrv == "t" || abrv == "u" || abrv == "v" || abrv == "w" || abrv == "x" ||
  123 + abrv == "y" || abrv == "z" {
  124 + return false, false
  125 + }
  126 + }
  127 +
116 128 if (pos >= 3 && t.runes[pos-3] == ' ') || pos == 2 {
117 129 abrv := strings.ToLower(string(t.runes[pos-2 : pos]))
118 130 // Common abbreviation? St. Vs. Mr. Dr. Ms.
119 131 if abrv == "mr" || abrv == "st" || abrv == "vs" || abrv == "dr" ||
120 132 abrv == "ms" || abrv == "no" || abrv == "yr" || abrv == "jr" ||
121   - abrv == "rs" || abrv == "co" {
  133 + abrv == "rs" || abrv == "co" || abrv == "rs" || abrv == "sq" {
122 134 return false, false
123 135 }
124 136 }
... ... @@ -129,7 +141,8 @@ func (t *SentenceTokenizer) isDelimiter(pos int) (bool, bool) {
129 141 if abrv == "jan" || abrv == "feb" || abrv == "mar" || abrv == "apr" ||
130 142 abrv == "jun" || abrv == "jul" || abrv == "aug" ||
131 143 abrv == "sep" || abrv == "oct" || abrv == "nov" || abrv == "dec" ||
132   - abrv == "mrs" || abrv == "man" || abrv == "rep" {
  144 + abrv == "mrs" || abrv == "man" || abrv == "rep" || abrv == "gov" ||
  145 + abrv == "ltd" || abrv == "inc" || abrv == "cpl" {
133 146 return false, false
134 147 }
135 148 }
... ... @@ -173,13 +186,17 @@ func (t *SentenceTokenizer) isDelimiter(pos int) (bool, bool) {
173 186 }
174 187  
175 188 case ':':
176   - return len(t.tokens) > 0 || (pos+1) == len(t.runes) || pos == 0 || t.runes[pos+1] == ' ' || t.runes[pos-1] == ' ', false
  189 + if len(t.tokens) > 0 || (pos+1) == len(t.runes) || pos == 0 ||
  190 + (t.runes[pos+1] == ' ' && ((pos+2) < (len(t.runes)) && t.runes[pos+2] != '"' && t.runes[pos+2] != '\'')) ||
  191 + t.runes[pos-1] == ' ' {
  192 + return true, false
  193 + }
177 194  
178 195 case ',':
179 196 return t.comma && ((pos+1) == len(t.runes) || t.runes[pos+1] == ' '), false
180 197  
181 198 case '"', '”', '“':
182   - if len(t.runes) > pos+1 && t.runes[pos+1] == '@' {
  199 + if pos > 0 && len(t.runes) > pos+1 && t.runes[pos+1] == '@' {
183 200 return true, false
184 201 } else {
185 202 return false, false
... ... @@ -191,7 +208,7 @@ func (t *SentenceTokenizer) isDelimiter(pos int) (bool, bool) {
191 208 case ')':
192 209 return pos == 0 || t.runes[pos-1] == ' ', false
193 210  
194   - case ' ', '$', '£', '€', '&', '%', '*', '#', '@', '~', '’', '\'', '‘', '/':
  211 + case ' ', '$', '£', '€', '&', '%', '*', '#', '@', '~', '’', '\'', '‘', '/', '′':
195 212 return false, false
196 213  
197 214 default:
parse/tokenizer.go 100644 → 100755
File mode changed
parse/tweettokenizer.go
... ... @@ -118,7 +118,7 @@ func (t *TweetTokenzier) isDelimiter(pos int) bool {
118 118 case ' ', '\t', '\n':
119 119 return true
120 120  
121   - case ']', '[', '!', '"', '$', '%', '&', '(', ')', '*', '+', '/',
  121 + case ']', '[', '!', '"', '$', '%', '(', ')', '*', '+', '/',
122 122 ';', '<', '>', '=', '?', '\\', '^', '_', '{', '}', '|', '~', '-', '¬', '·':
123 123 return t.currentTokenURL != true
124 124  
parse/whitespacetokenizer.go 100644 → 100755
File mode changed
similarity/similarity.go 100644 → 100755
File mode changed
similarity/similarity_test.go 100644 → 100755
File mode changed
... ... @@ -15,6 +15,36 @@ func Count(terms []string) (counted map[string]int) {
15 15 return counted
16 16 }
17 17  
  18 +func Overlap(c1, c2 map[string]int) (int, []string) {
  19 + overlapCount := 0
  20 + overlapping := []string{}
  21 + if len(c2) > len(c1) {
  22 + c1, c2 = c2, c1
  23 + }
  24 + for k, _ := range c1 {
  25 + if _, found := c2[k]; found {
  26 + overlapCount++
  27 + overlapping = append(overlapping, k)
  28 + }
  29 + }
  30 + return overlapCount, overlapping
  31 +}
  32 +
  33 +func OverlapInt16(c1, c2 map[string]int16) (int, []string) {
  34 + overlapCount := 0
  35 + overlapping := []string{}
  36 + if len(c2) > len(c1) {
  37 + c1, c2 = c2, c1
  38 + }
  39 + for k, _ := range c1 {
  40 + if _, found := c2[k]; found {
  41 + overlapCount++
  42 + overlapping = append(overlapping, k)
  43 + }
  44 + }
  45 + return overlapCount, overlapping
  46 +}
  47 +
18 48 // Take two term frequency maps add the source map to the destination map
19 49 func Combine(dest, source *map[string]int) {
20 50 d := *dest
... ... @@ -23,6 +53,14 @@ func Combine(dest, source *map[string]int) {
23 53 }
24 54 }
25 55  
  56 +// Take two term frequency maps and add the source map to the destination map
  57 +func CombineInt16(dest, source *map[string]int16) {
  58 + d := *dest
  59 + for k, v := range *source {
  60 + d[k] += v
  61 + }
  62 +}
  63 +
26 64 // Counts the number of terms in m2 which are not in m1
27 65 func Difference(m1, m2 map[string]int) (count int, difference []string) {
28 66 for k, _ := range m2 {
... ... @@ -33,6 +71,16 @@ func Difference(m1, m2 map[string]int) (count int, difference []string) {
33 71 return len(difference), difference
34 72 }
35 73  
  74 +// Counts the number of terms in m2 which are not in m1
  75 +func DifferenceInt16(m1, m2 map[string]int16) (count int, difference []string) {
  76 + for k, _ := range m2 {
  77 + if _, ok := m1[k]; ok != true {
  78 + difference = append(difference, k)
  79 + }
  80 + }
  81 + return len(difference), difference
  82 +}
  83 +
36 84 // readLines reads a whole file into memory
37 85 // and returns a slice of its lines.
38 86 func ReadLines(path string) ([]string, error) {
... ... @@ -60,7 +108,7 @@ func URLFilter(text string) (clean string, urls []string) {
60 108 start = strings.Index(clean, "https://")
61 109 }
62 110 if start >= 0 {
63   - end := strings.Index(clean[start:], " . ")
  111 + end := strings.Index(clean[start:], " ")
64 112 if end == -1 {
65 113 urls = append(urls, clean[start:])
66 114 clean = clean[:start]
transform/alphanumerictransform.go 100644 → 100755
File mode changed
transform/lowercasetransform.go 100644 → 100755
File mode changed
transform/porterstemmertransform.go 100644 → 100755
File mode changed
transform/transform.go 100644 → 100755
File mode changed