Commit 35333f85d9e98f02e0c2137235f716b58de83cce

Authored by James McMinn
1 parent ab36419bbd
Exists in master

Switched to using runes for better unicode support. Vastly improved sentence and tweet tokenizers

Showing 6 changed files with 160 additions and 33 deletions
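Note on the rune switch: Go strings are UTF-8 byte sequences, so indexing with t.text[pos] returns a single byte and can land mid-character for non-ASCII text, while a []rune slice indexes whole code points. A minimal standalone sketch of the difference (not from this repo):

    package main

    import "fmt"

    func main() {
        s := "café…"
        fmt.Println(len(s)) // 8: byte length, not character count

        r := []rune(s)
        fmt.Println(len(r)) // 5: one rune per code point

        fmt.Println(s[3])          // 195: first byte of 'é' on its own
        fmt.Println(string(r[3:])) // "é…": slicing on rune boundaries is safe
    }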

filter/lengthfilter.go
1 1 package filter
2 2  
  3 +import (
  4 + "strconv"
  5 +)
  6 +
3 7 type LengthFilter struct {
4   - minLength int
  8 + minLength int
  9 + allowNumbers bool
5 10 }
6 11  
7   -func NewLengthFilter(minLength int) *LengthFilter {
8   - return &LengthFilter{minLength}
  12 +func NewLengthFilter(minLength int, allowNumbers bool) *LengthFilter {
  13 + return &LengthFilter{minLength, allowNumbers}
9 14 }
10 15  
11 16 func (filter *LengthFilter) Filter(input []string) (output []string) {
12 17 for i := range input {
13   - if len(input[i]) < filter.minLength {
  18 + // Is it a number?
  19 + _, err := strconv.Atoi(input[i])
  20 +
  21 + if len(input[i]) < filter.minLength && (filter.allowNumbers == false || err != nil) {
14 22 continue
15 23 }
16 24  
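Hypothetical use of the updated filter (the line that appends kept tokens to output is below the cut, so the result shown is an assumption):

    f := NewLengthFilter(3, true)
    out := f.Filter([]string{"a", "42", "hello"})
    // "a" is dropped as too short; "42" is also below minLength but is
    // kept because allowNumbers is true and strconv.Atoi parses it;
    // "hello" passes the length check outright.
    // out == ["42" "hello"]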
... ... @@ -21,6 +21,7 @@ func ReadCSVFile(path string, separator rune) (reader *CSVReader, err error) {
21 21 reader.Reader = csv.NewReader(file)
22 22 reader.Reader.Comma = separator
23 23 reader.Reader.LazyQuotes = true
  24 + reader.Reader.FieldsPerRecord = len(reader.Headings)
24 25  
25 26 return reader, nil
26 27 }
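Setting FieldsPerRecord to len(reader.Headings) makes encoding/csv enforce that every record has exactly as many fields as the header, rather than silently accepting ragged rows. A standalone illustration:

    package main

    import (
        "encoding/csv"
        "fmt"
        "strings"
    )

    func main() {
        r := csv.NewReader(strings.NewReader("a,b,c\n1,2\n"))
        r.FieldsPerRecord = 3 // what the hunk above derives from the headings
        r.Read()              // header row: 3 fields, fine
        _, err := r.Read()    // "1,2" has only 2 fields
        fmt.Println(err)      // non-nil, wraps csv.ErrFieldCount
    }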
index/invertedindex.go
... ... @@ -53,7 +53,7 @@ func (i *Inverted) IDF(n string) float64 {
53 53 if docs != nil {
54 54 l = docs.Len()
55 55 }
56   - return math.Log2(float64(i.NumDocuments+1) / float64(l+1))
  56 + return math.Log2(float64(i.NumDocuments) / float64(l+1))
57 57 }
58 58  
59 59 func (i *Inverted) AddDocument(d Doc) {
... ... @@ -85,3 +85,11 @@ func (i *Inverted) RemoveDocument(d Doc) {
85 85  
86 86 i.NumDocuments--
87 87 }
  88 +
  89 +func (i *Inverted) GetKeys() (keys []string) {
  90 + keys = make([]string, 0, len(i.index))
  91 + for k := range i.index {
  92 + keys = append(keys, k)
  93 + }
  94 + return keys
  95 +}
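The IDF hunk above removes the +1 smoothing from the numerator, so the score is now IDF(t) = log2(NumDocuments / (df(t) + 1)), where df(t) is the number of documents containing t. Worked example: with 1024 documents and a term appearing in 7 of them, log2(1024 / 8) = 7 exactly, where the old formula gave log2(1025 / 8) ≈ 7.001. The +1 in the denominator still guards against division by zero for unseen terms.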
parse/sentencetokenizer.go
... ... @@ -5,6 +5,7 @@ package parse
5 5 import (
6 6 "io"
7 7 "strings"
  8 + "unicode"
8 9 )
9 10  
10 11 type SentenceTokenizer struct {
... ... @@ -12,15 +13,24 @@ type SentenceTokenizer struct {
12 13 tokens []string
13 14 position int
14 15 comma bool
  16 + runes []rune
15 17 }
16 18  
17 19 func NewSentenceTokenizer(text string) (tokenizer *SentenceTokenizer) {
18 20 text = strings.Replace(text, "\n", " ", -1)
  21 + n := 0
  22 + runes := make([]rune, len(text))
  23 + for _, r := range text {
  24 + runes[n] = r
  25 + n++
  26 + }
  27 + runes = runes[0:n]
19 28 tokenizer = &SentenceTokenizer{
20 29 text: text,
21 30 tokens: []string{},
22 31 position: 0,
23 32 comma: false,
  33 + runes: runes,
24 34 }
25 35 return tokenizer
26 36 }
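The copy loop above (repeated in the tweet tokenizer below) is equivalent to Go's built-in conversion:

    runes := []rune(text) // one rune per code point; invalid UTF-8 bytes become U+FFFD

The explicit loop trades the conversion's extra counting pass for a len(text)-sized allocation (the byte count is an upper bound on the rune count); both produce the same slice.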
... ... @@ -53,22 +63,22 @@ func (t *SentenceTokenizer) Longest() (longest string, length int) {
53 63 }
54 64  
55 65 func (t *SentenceTokenizer) nextToken() (token string, err error) {
56   - for i := t.position; i < len(t.text); i++ {
  66 + for i := t.position; i < len(t.runes); i++ {
57 67 // no new token if the first char after the last delimiter is another delimiter
58   - if delimiter := t.isDelimiter(i); delimiter && t.position == i {
  68 + if delimiter, keep := t.isDelimiter(i); delimiter && t.position == i {
59 69 t.position++
60   - } else if delimiter {
61   - token = t.text[t.position:i]
  70 + } else if delimiter && !keep {
  71 + token = string(t.runes[t.position:i])
62 72 t.position = i + 1
63 73 break
64   - } else if i == (len(t.text) - 1) { // We're at the last char and it's not a delimiter, so we must include it
65   - token = t.text[t.position : i+1]
  74 + } else if i == (len(t.runes)-1) || keep { // Last char, or a delimiter we keep as part of the token
  75 + token = string(t.runes[t.position : i+1])
66 76 t.position = i + 1
67 77 break
68 78 }
69 79 }
70 80  
71   - if token == "" && t.position >= len(t.text)-1 {
  81 + if token == "" && t.position >= len(t.runes)-1 {
72 82 return token, io.EOF
73 83 }
74 84  
... ... @@ -76,22 +86,100 @@ func (t *SentenceTokenizer) nextToken() (token string, err error) {
76 86 return token, nil
77 87 }
78 88  
79   -func (t *SentenceTokenizer) isDelimiter(pos int) bool {
80   - c := t.text[pos]
  89 +func (t *SentenceTokenizer) isDelimiter(pos int) (bool, bool) {
  90 + c := t.runes[pos]
  91 +
  92 + if c == '.' {
  93 + // Part of an ellipsis?
  94 + if len(t.runes) > (pos+1) && t.runes[pos+1] == '.' {
  95 + return false, false
  96 + }
  97 +
  98 + // Part of an Acronym?
  99 + if ((pos+3) <= len(t.runes) && t.runes[pos+2] == '.' && t.runes[pos+1] != '.') || (pos > 2 && t.runes[pos-2] == '.' && t.runes[pos-1] != '.') {
  100 + return false, false
  101 + }
  102 +
  103 + // End of a quote?
  104 + if len(t.runes) > (pos+1) && t.runes[pos+1] == '"' {
  105 + return false, false
  106 + }
  107 +
  108 + if (pos >= 3 && t.runes[pos-3] == ' ') || pos == 2 {
  109 + abrv := strings.ToLower(string(t.runes[pos-2 : pos]))
  110 + // Common abbreviation? St. Vs. Mr. Dr. Ms.
  111 + if abrv == "mr" || abrv == "st" || abrv == "vs" || abrv == "dr" ||
  112 + abrv == "ms" || abrv == "no" || abrv == "yr" || abrv == "jr" ||
  113 + abrv == "rs" || abrv == "co" {
  114 + return false, false
  115 + }
  116 + }
  117 +
  118 + if (pos >= 4 && t.runes[pos-4] == ' ') || pos == 3 {
  119 + abrv := strings.ToLower(string(t.runes[pos-3 : pos]))
  120 + // Common abbreviation? jan, feb, mar, mrs
  121 + if abrv == "jan" || abrv == "feb" || abrv == "mar" || abrv == "apr" ||
  122 + abrv == "jun" || abrv == "jul" || abrv == "aug" ||
  123 + abrv == "sep" || abrv == "oct" || abrv == "nov" || abrv == "dec" ||
  124 + abrv == "mrs" || abrv == "man" {
  125 + return false, false
  126 + }
  127 + }
  128 +
  129 + if (pos >= 5 && t.runes[pos-5] == ' ') || pos == 4 {
  130 + abrv := strings.ToLower(string(t.runes[pos-4 : pos]))
  131 + // Common four-letter abbreviation? janu, febr, sept, prof, pres
  132 + if abrv == "janu" || abrv == "febr" || abrv == "marc" || abrv == "apri" ||
  133 + abrv == "augu" || abrv == "sept" || abrv == "octo" || abrv == "nove" ||
  134 + abrv == "dece" || abrv == "prof" || abrv == "pres" {
  135 + return false, false
  136 + }
  137 + }
  138 +
  139 + // A decimal number?
  140 + if pos > 0 && pos < len(t.runes)-1 && unicode.IsNumber(t.runes[pos-1]) && unicode.IsNumber(t.runes[pos+1]) {
  141 + return false, false
  142 + }
  143 +
  144 + return true, false
  145 + }
81 146  
82 147 switch c {
83   - case '!', '?', '|', '·':
84   - return true
  148 + case '!', '?', '…':
  149 + return true, true
85 150  
86   - case '-', '~', '.':
87   - return (pos+1) == len(t.text) || t.text[pos+1] == ' '
  151 + case '|', '·', '–', '—', '►', '[', ']', ';', '>', '<', '^', '+', '=':
  152 + return true, false
88 153  
89   - // case ':':
90   - // return (pos+1) == len(t.text) || t.text[pos+1] != '/'
  154 + case '-':
  155 + if pos == 0 || pos == (len(t.runes)-1) {
  156 + return false, false
  157 + } else if unicode.IsNumber(t.runes[pos+1]) ||
  158 + ((unicode.IsLetter(t.runes[pos-1]) || unicode.IsNumber(t.runes[pos-1])) &&
  159 + (unicode.IsLetter(t.runes[pos+1]) || unicode.IsNumber(t.runes[pos+1]))) ||
  160 + t.runes[pos-1] == '.' ||
  161 + t.runes[pos+1] == '.' {
  162 + return false, false
  163 + } else {
  164 + return true, false
  165 + }
  166 +
  167 + case ':':
  168 + return len(t.tokens) > 0 || (pos+1) == len(t.runes) || pos == 0 || t.runes[pos+1] == ' ' || t.runes[pos-1] == ' ', false
91 169  
92 170 case ',':
93   - return t.comma && ((pos+1) == len(t.text) || t.text[pos+1] == ' ')
  171 + return t.comma && ((pos+1) == len(t.runes) || t.runes[pos+1] == ' '), false
  172 +
  173 + case '"', '”', '“':
  174 + return true, true
  175 +
  176 + case ' ', '$', '£', '€', '&', '%', '*', '#', '@', '~', '’', '\'', '‘':
  177 + return false, false
  178 +
  179 + default:
  180 + return (unicode.IsLetter(c) == false && unicode.IsNumber(c) == false), false
  181 +
94 182 }
95 183  
96   - return false
  184 + return false, false
97 185 }
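A hypothetical call exercising the new rules, using the Longest signature shown above (Longest's internals are not in this diff, so the output is illustrative only):

    t := NewSentenceTokenizer("Dr. Smith paid 3.5 for it! Then he left…")
    sentence, length := t.Longest()
    // "Dr." matches the two-letter abbreviation list and "3.5" the
    // decimal-number check, so neither period splits the sentence;
    // '!' and '…' are delimiters with keep == true, so they end a
    // sentence and stay attached to its final token.
    fmt.Println(sentence, length)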
parse/tweettokenizer.go
... ... @@ -4,6 +4,7 @@ package parse
4 4  
5 5 import (
6 6 "io"
  7 + "unicode"
7 8 )
8 9  
9 10 type TweetTokenzier struct {
... ... @@ -14,10 +15,18 @@ type TweetTokenzier struct {
14 15 tokens []string
15 16 entities []string
16 17 nonEntities []string
  18 + runes []rune
17 19 }
18 20  
19 21 func NewTweetTokenizer(text string) (tokenizer *TweetTokenzier) {
20   - tokenizer = &TweetTokenzier{text, 0, false, false, []string{}, []string{}, []string{}}
  22 + n := 0
  23 + runes := make([]rune, len(text))
  24 + for _, r := range text {
  25 + runes[n] = r
  26 + n++
  27 + }
  28 + runes = runes[0:n]
  29 + tokenizer = &TweetTokenzier{text, 0, false, false, []string{}, []string{}, []string{}, runes}
21 30 return tokenizer
22 31 }
23 32  
... ... @@ -58,22 +67,24 @@ func (t *TweetTokenzier) AllTokenTypes() (allTokens []string, entities []string,
58 67 func (t *TweetTokenzier) nextToken() (token string, entity bool, err error) {
59 68 t.currentTokenEntity = false
60 69 t.currentTokenURL = false
61   - for i := t.position; i < len(t.text); i++ {
  70 + for i := t.position; i < len(t.runes); i++ {
62 71 // no new token if the first char after the last delimiter is another delimiter
63 72 if delimiter := t.isDelimiter(i); delimiter && t.position == i {
64 73 t.position++
65 74 } else if delimiter {
66   - token = t.text[t.position:i]
  75 + token = string(t.runes[t.position:i])
67 76 t.position = i + 1
68 77 break
69   - } else if i == (len(t.text) - 1) { // We're at the last char and it's not a delimiter, so we must include it
70   - token = t.text[t.position : i+1]
  78 + } else if i == (len(t.runes) - 1) { // We're at the last char and it's not a delimiter, so we must include it
  79 + token = string(t.runes[t.position : i+1])
71 80 t.position = i + 1
72 81 break
73 82 }
74 83 }
75 84  
76   - if token == "1" {
  85 + if token == "0" {
  86 + token = "ZERO"
  87 + } else if token == "1" {
77 88 token = "ONE"
78 89 } else if token == "2" {
79 90 token = "TWO"
... ... @@ -91,9 +102,11 @@ func (t *TweetTokenzier) nextToken() (token string, entity bool, err error) {
91 102 token = "EIGHT"
92 103 } else if token == "9" {
93 104 token = "NINE"
  105 + } else if token == "000" {
  106 + token = "k"
94 107 }
95 108  
96   - if token == "" && t.position >= len(t.text)-1 {
  109 + if token == "" && t.position >= len(t.runes)-1 {
97 110 return token, false, io.EOF
98 111 }
99 112  
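The if/else chain spelling out digit tokens could equivalently be a table lookup; a sketch, not the committed code (entries for "3" through "7" are elided here as they are in the hunk):

    var numberWords = map[string]string{
        "0": "ZERO", "1": "ONE", "2": "TWO",
        // … "3" through "7" follow the same pattern …
        "8": "EIGHT", "9": "NINE",
        "000": "k",
    }

    if word, ok := numberWords[token]; ok {
        token = word
    }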
... ... @@ -101,18 +114,27 @@ func (t *TweetTokenzier) nextToken() (token string, entity bool, err error) {
101 114 }
102 115  
103 116 func (t *TweetTokenzier) isDelimiter(pos int) bool {
104   - c := t.text[pos]
  117 + c := t.runes[pos]
105 118  
106 119 switch c {
107 120 case ' ', '\t', '\n':
108 121 return true
109 122  
110   - case ']', '[', '!', '"', '$', '%', '&', '(', ')', '*', '+', ',', '.', '/',
  123 + case ']', '[', '!', '"', '$', '%', '&', '(', ')', '*', '+', '/',
111 124 ';', '<', '>', '=', '?', '\\', '^', '_', '{', '}', '|', '~', '-', '¬', '·':
112 125 return t.currentTokenURL != true
113 126  
  127 + case ',', '.':
  128 + if pos == 0 || pos == len(t.runes)-1 {
  129 + return true
  130 + } else if unicode.IsNumber(t.runes[pos+1]) && unicode.IsNumber(t.runes[pos-1]) {
  131 + return false
  132 + } else {
  133 + return true
  134 + }
  135 +
114 136 case ':':
115   - if t.currentTokenEntity == false && pos > 3 && t.text[pos-4:pos] != "http" {
  137 + if t.currentTokenEntity == false && pos > 3 && string(t.runes[pos-4:pos]) != "http" {
116 138 return true
117 139 } else {
118 140 t.currentTokenEntity = true
... ... @@ -60,7 +60,7 @@ func URLFilter(text string) (clean string, urls []string) {
60 60 start = strings.Index(clean, "https://")
61 61 }
62 62 if start >= 0 {
63   - end := strings.Index(clean[start:], " ")
  63 + end := strings.Index(clean[start:], " . ")
64 64 if end == -1 {
65 65 urls = append(urls, clean[start:])
66 66 clean = clean[:start]
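With the terminator changed from " " to " . ", a URL now extends to the next space-period-space sequence instead of the first space. A sketch of the effect on made-up input (assumes the surrounding strings and fmt imports):

    clean := "see http://t.co/abc . more text"
    start := strings.Index(clean, "http://")
    end := strings.Index(clean[start:], " . ")
    fmt.Println(clean[start : start+end]) // http://t.co/abc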