Commit a307d82a1b71172c90d21b465bd7f6715a6486af

Authored by James McMinn
1 parent 35333f85d9
Exists in master

Few small changes to sentence tokenizer, converted the sortedmap to use interfaces

Showing 4 changed files with 95 additions and 44 deletions Side-by-side Diff

... ... @@ -3,23 +3,69 @@ A very simple sorted map.
3 3 */
4 4 package index
5 5  
6   -type key string
7   -type val float64
  6 +type Sortable interface {
  7 + GT(to interface{}) bool
  8 + LT(to interface{}) bool
  9 + Compare(to interface{}) int
  10 +}
  11 +
  12 +type Integer64 int64
  13 +type Integer int
  14 +type Float64 float64
  15 +
  16 +func (v Integer64) Compare(to interface{}) int {
  17 + t := to.(Integer64)
  18 + if v.GT(t) {
  19 + return 1
  20 + }
  21 + if v.LT(t) {
  22 + return -1
  23 + }
  24 + return 0
  25 +}
  26 +func (v Integer64) GT(to interface{}) bool { return v > to.(Integer64) }
  27 +func (v Integer64) LT(to interface{}) bool { return v < to.(Integer64) }
  28 +
  29 +func (v Integer) Compare(to interface{}) int {
  30 + t := to.(Integer)
  31 + if v.GT(t) {
  32 + return 1
  33 + }
  34 + if v.LT(t) {
  35 + return -1
  36 + }
  37 + return 0
  38 +}
  39 +func (v Integer) GT(to interface{}) bool { return v > to.(Integer) }
  40 +func (v Integer) LT(to interface{}) bool { return v < to.(Integer) }
  41 +
  42 +func (v Float64) Compare(to interface{}) int {
  43 + t := to.(Float64)
  44 + if v.GT(t) {
  45 + return 1
  46 + }
  47 + if v.LT(t) {
  48 + return -1
  49 + }
  50 + return 0
  51 +}
  52 +func (v Float64) GT(to interface{}) bool { return v > to.(Float64) }
  53 +func (v Float64) LT(to interface{}) bool { return v < to.(Float64) }
8 54  
9 55 type SortedMap struct {
10   - data map[key]val
11   - order []key
12   - ord map[key]int
  56 + data map[interface{}]Sortable
  57 + order []interface{}
  58 + ord map[interface{}]int
13 59 }
14 60  
15 61 func NewSortedMap() *SortedMap {
16 62 m := new(SortedMap)
17   - m.data = make(map[key]val)
18   - m.ord = make(map[key]int)
  63 + m.data = make(map[interface{}]Sortable)
  64 + m.ord = make(map[interface{}]int)
19 65 return m
20 66 }
21 67  
22   -func (m *SortedMap) Put(item key, value val) {
  68 +func (m *SortedMap) Put(item interface{}, value Sortable) {
23 69 if _, ok := m.data[item]; ok == false {
24 70 m.order = append(m.order, item)
25 71 m.ord[item] = len(m.ord)
... ... @@ -28,7 +74,7 @@ func (m *SortedMap) Put(item key, value val) {
28 74 m.fixOrder(m.ord[item])
29 75 }
30 76  
31   -func (m *SortedMap) Remove(item key) {
  77 +func (m *SortedMap) Remove(item interface{}) {
32 78 pos := m.ord[item]
33 79 delete(m.ord, item)
34 80 for _, v := range m.order[pos+1:] {
... ... @@ -38,13 +84,18 @@ func (m *SortedMap) Remove(item key) {
38 84 delete(m.data, item)
39 85 }
40 86  
  87 +func (m *SortedMap) Contains(item interface{}) bool {
  88 + _, contains := m.data[item]
  89 + return contains
  90 +}
  91 +
41 92 func (m *SortedMap) fixOrder(position int) {
42 93 currentPos := position
43 94  
44 95 if prevPos := currentPos - 1; prevPos >= 0 {
45 96 curr := m.order[currentPos]
46 97 prev := m.order[prevPos]
47   - if m.data[curr] < m.data[prev] {
  98 + if m.data[curr].LT(m.data[prev]) {
48 99 m.order[currentPos], m.order[prevPos] = m.order[prevPos], m.order[currentPos]
49 100 m.ord[m.order[currentPos]] = currentPos
50 101 m.ord[m.order[prevPos]] = prevPos
... ... @@ -55,7 +106,7 @@ func (m *SortedMap) fixOrder(position int) {
55 106 if nextPos := currentPos + 1; nextPos < len(m.ord) {
56 107 curr := m.order[currentPos]
57 108 next := m.order[nextPos]
58   - if m.data[curr] > m.data[next] {
  109 + if m.data[curr].GT(m.data[next]) {
59 110 m.order[currentPos], m.order[nextPos] = m.order[nextPos], m.order[currentPos]
60 111 m.ord[m.order[currentPos]] = currentPos
61 112 m.ord[m.order[nextPos]] = nextPos
... ... @@ -64,30 +115,10 @@ func (m *SortedMap) fixOrder(position int) {
64 115 }
65 116 }
66 117  
67   -func (m *SortedMap) Dec(item key, amount val) (newValue val) {
68   - if _, ok := m.data[item]; ok == false {
69   - m.Put(item, 0-amount)
70   - return amount
71   - }
72   - m.data[item] = m.data[item] - amount
73   - m.fixOrder(m.ord[item])
74   - return m.data[item]
75   -}
76   -
77   -func (m *SortedMap) Inc(item key, amount val) (newValue val) {
78   - if _, ok := m.data[item]; ok == false {
79   - m.Put(item, amount)
80   - return amount
81   - }
82   - m.data[item] = m.data[item] + amount
83   - m.fixOrder(m.ord[item])
84   - return m.data[item]
85   -}
86   -
87   -func (m *SortedMap) Get(item key) val {
  118 +func (m *SortedMap) Get(item interface{}) Sortable {
88 119 return m.data[item]
89 120 }
90 121  
91   -func (m *SortedMap) OrderedKeys() []key {
  122 +func (m *SortedMap) OrderedKeys() []interface{} {
92 123 return m.order
93 124 }
parse/sentencetokenizer.go
... ... @@ -71,7 +71,11 @@ func (t *SentenceTokenizer) nextToken() (token string, err error) {
71 71 token = string(t.runes[t.position:i])
72 72 t.position = i + 1
73 73 break
74   - } else if i == (len(t.runes)-1) || keep { // We're at the last char and it's not a delimiter, so we must include it
  74 + } else if i == (len(t.runes) - 1) { // We're at the last char and it's not a delimiter, so we must include it
  75 + token = string(t.runes[t.position : i+1])
  76 + t.position = i + 1
  77 + break
  78 + } else if delimiter && keep { // A delimiter flagged as keep, so include it in the token
75 79 token = string(t.runes[t.position : i+1])
76 80 t.position = i + 1
77 81 break
... ... @@ -91,8 +95,12 @@ func (t *SentenceTokenizer) isDelimiter(pos int) (bool, bool) {
91 95  
92 96 if c == '.' {
93 97 //Part of an ellipsis?
94   - if len(t.runes) > (pos+1) && t.runes[pos+1] == '.' {
  98 + if pos+2 < len(t.runes) && string(t.runes[pos:pos+3]) == "..." {
95 99 return false, false
  100 + } else if pos+1 < len(t.runes) && pos > 0 && t.runes[pos-1] == '.' && t.runes[pos+1] == '.' {
  101 + return false, false
  102 + } else if pos > 2 && string(t.runes[pos-2:pos+1]) == "..." {
  103 + return true, true
96 104 }
97 105  
98 106 // Part of an Acronym?
... ... @@ -121,7 +129,7 @@ func (t *SentenceTokenizer) isDelimiter(pos int) (bool, bool) {
121 129 if abrv == "jan" || abrv == "feb" || abrv == "mar" || abrv == "apr" ||
122 130 abrv == "jun" || abrv == "jul" || abrv == "aug" ||
123 131 abrv == "sep" || abrv == "oct" || abrv == "nov" || abrv == "dec" ||
124   - abrv == "mrs" || abrv == "man" {
  132 + abrv == "mrs" || abrv == "man" || abrv == "rep" {
125 133 return false, false
126 134 }
127 135 }
... ... @@ -146,7 +154,7 @@ func (t *SentenceTokenizer) isDelimiter(pos int) (bool, bool) {
146 154  
147 155 switch c {
148 156 case '!', '?', '…':
149   - return true, true
  157 + return pos+1 == len(t.runes) || t.runes[pos+1] != '"', true
150 158  
151 159 case '|', '·', '–', '—', '►', '[', ']', ';', '>', '<', '^', '+', '=':
152 160 return true, false
... ... @@ -171,9 +179,19 @@ func (t *SentenceTokenizer) isDelimiter(pos int) (bool, bool) {
171 179 return t.comma && ((pos+1) == len(t.runes) || t.runes[pos+1] == ' '), false
172 180  
173 181 case '"', '”', '“':
174   - return true, true
  182 + if len(t.runes) > pos+1 && t.runes[pos+1] == '@' {
  183 + return true, false
  184 + } else {
  185 + return false, false
  186 + }
  187 +
  188 + case '(':
  189 + return (pos+1) == len(t.runes) || t.runes[pos+1] == ' ', false
  190 +
  191 + case ')':
  192 + return pos == 0 || t.runes[pos-1] == ' ', false
175 193  
176   - case ' ', '$', '£', '€', '&', '%', '*', '#', '@', '~', '’', '\'', '‘':
  194 + case ' ', '$', '£', '€', '&', '%', '*', '#', '@', '~', '’', '\'', '‘', '/':
177 195 return false, false
178 196  
179 197 default:
parse/tweettokenizer.go
... ... @@ -102,8 +102,6 @@ func (t *TweetTokenzier) nextToken() (token string, entity bool, err error) {
102 102 token = "EIGHT"
103 103 } else if token == "9" {
104 104 token = "NINE"
105   - } else if token == "000" {
106   - token = "k"
107 105 }
108 106  
109 107 if token == "" && t.position >= len(t.runes)-1 {
transform/porterstemmertransform.go
... ... @@ -13,9 +13,13 @@ func NewPorterStemmer() *PorterStemmer {
13 13 return &PorterStemmer{porter2.Stemmer}
14 14 }
15 15  
16   -func (transform *PorterStemmer) Apply(input []string) (output []string) {
17   - for i := range input {
18   - output = append(output, transform.stemmer.Stem(input[i]))
  16 +func (transform *PorterStemmer) ApplyAll(input []string) (output []string) {
  17 + for _, v := range input {
  18 + output = append(output, transform.Apply(v))
19 19 }
20 20 return output
21 21 }
  22 +
  23 +func (transform *PorterStemmer) Apply(input string) (output string) {
  24 + return transform.stemmer.Stem(input)
  25 +}