Refine TokenCounter handling and output

Decode UTF-8 across Write calls and buffer partial runes. Approximate
tokenization by splitting long alphanumeric runs (more than 4 runes) into
separate tokens, and properly count spaces and punctuation. Remove the
TokenCounter Printf/Println helpers and update callers to use
fmt.Fprintf/Fprintln. Avoid writing when the underlying writer is nil.
This commit is contained in:
2026-01-31 03:24:17 +01:00
parent 4453856fd1
commit 14dfacea29
2 changed files with 64 additions and 39 deletions
+50 -25
View File
@@ -4,14 +4,17 @@ import (
"fmt"
"io"
"unicode"
"unicode/utf8"
)
// TokenCounter counts approximate tokens in the bytes passed to Write and
// forwards those bytes to the wrapped writer w.
//
// NOTE(review): this listing comes from a diff view and appears to interleave
// the pre-change and post-change field lists (w, Count, Err and inWord occur
// twice). The first group looks like the removed version — confirm against
// the actual file before relying on it.
type TokenCounter struct {
// w is the underlying writer that Write forwards its input to.
w io.Writer
// Count is the running number of tokens counted so far.
Count int64
// Err holds the error from the underlying write; Printf returns without
// writing once it is non-nil.
Err error
// inWord is true while the previous rune was a letter, number, or '_'.
inWord bool
// inSpace is true while the previous rune was whitespace, so that a run of
// whitespace is counted only once.
inSpace bool
// w is the underlying writer that Write forwards its input to.
w io.Writer
// Count is the running number of tokens counted so far.
Count int64
// Err holds the error from the underlying write; Printf returns without
// writing once it is non-nil.
Err error
// leftover keeps trailing bytes that could not yet be decoded as a rune so
// they can be prepended to the data of the next Write call.
leftover []byte
// inWord is true while the previous rune was a letter, number, or '_'.
inWord bool
// wordLen is the number of runes in the current alphanumeric chunk; when it
// exceeds 4, Count is incremented and wordLen restarts at 1.
wordLen int
}
func (tc *TokenCounter) Write(p []byte) (int, error) {
@@ -19,8 +22,29 @@ func (tc *TokenCounter) Write(p []byte) (int, error) {
return 0, tc.Err
}
for _, b := range p {
r := rune(b)
data := p
if len(tc.leftover) > 0 {
data = make([]byte, len(tc.leftover)+len(p))
copy(data, tc.leftover)
copy(data[len(tc.leftover):], p)
}
totalProcessed := 0
for len(data) > 0 {
r, size := utf8.DecodeRune(data)
if r == utf8.RuneError && size == 1 {
if len(data) < utf8.UTFMax {
tc.leftover = data
break
}
}
data = data[size:]
totalProcessed += size
tc.leftover = nil
isSpace := unicode.IsSpace(r)
isAlpha := unicode.IsLetter(r) || unicode.IsNumber(r) || r == '_'
@@ -28,24 +52,32 @@ func (tc *TokenCounter) Write(p []byte) (int, error) {
if !tc.inWord {
tc.Count++
tc.inWord = true
tc.inSpace = false
tc.wordLen = 1
} else {
tc.wordLen++
if tc.wordLen > 4 {
tc.Count++
tc.wordLen = 1
}
}
} else if isSpace {
if !tc.inSpace {
tc.Count++
tc.inSpace = true
tc.inWord = false
}
tc.inWord = false
tc.wordLen = 0
} else {
tc.Count++
tc.inWord = false
tc.inSpace = false
tc.wordLen = 0
}
}
n, err := tc.w.Write(p)
tc.Err = err
return n, err
var n int
if tc.w != nil {
n, tc.Err = tc.w.Write(p)
} else {
n = len(p)
}
return n, tc.Err
}
func (tc *TokenCounter) WriteByte(c byte) error {
@@ -57,12 +89,5 @@ func (tc *TokenCounter) Printf(format string, a ...any) {
if tc.Err != nil {
return
}
_, _ = fmt.Fprintf(tc, format, a...)
}
func (tc *TokenCounter) Println(a ...any) {
if tc.Err != nil {
return
}
_, _ = fmt.Fprintln(tc, a...)
fmt.Fprintf(tc, format, a...)
}