mirror of
https://github.com/skidoodle/ctx.git
synced 2026-04-28 03:07:41 +02:00
Refine TokenCounter handling and output
Decode UTF-8 across Write calls, buffering partial runes between calls. Approximate tokenization by splitting long alphanumeric runs (longer than 4 characters) and properly counting spaces and punctuation. Remove the TokenCounter Printf/Println helpers and update callers to use fmt.Fprintf/Fprintln. Avoid writing when the underlying writer is nil.
This commit is contained in:
@@ -4,14 +4,17 @@ import (
|
||||
"fmt"
|
||||
"io"
|
||||
"unicode"
|
||||
"unicode/utf8"
|
||||
)
|
||||
|
||||
type TokenCounter struct {
|
||||
w io.Writer
|
||||
Count int64
|
||||
Err error
|
||||
inWord bool
|
||||
inSpace bool
|
||||
w io.Writer
|
||||
Count int64
|
||||
Err error
|
||||
|
||||
leftover []byte
|
||||
inWord bool
|
||||
wordLen int
|
||||
}
|
||||
|
||||
func (tc *TokenCounter) Write(p []byte) (int, error) {
|
||||
@@ -19,8 +22,29 @@ func (tc *TokenCounter) Write(p []byte) (int, error) {
|
||||
return 0, tc.Err
|
||||
}
|
||||
|
||||
for _, b := range p {
|
||||
r := rune(b)
|
||||
data := p
|
||||
if len(tc.leftover) > 0 {
|
||||
data = make([]byte, len(tc.leftover)+len(p))
|
||||
copy(data, tc.leftover)
|
||||
copy(data[len(tc.leftover):], p)
|
||||
}
|
||||
|
||||
totalProcessed := 0
|
||||
|
||||
for len(data) > 0 {
|
||||
r, size := utf8.DecodeRune(data)
|
||||
|
||||
if r == utf8.RuneError && size == 1 {
|
||||
if len(data) < utf8.UTFMax {
|
||||
tc.leftover = data
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
data = data[size:]
|
||||
totalProcessed += size
|
||||
tc.leftover = nil
|
||||
|
||||
isSpace := unicode.IsSpace(r)
|
||||
isAlpha := unicode.IsLetter(r) || unicode.IsNumber(r) || r == '_'
|
||||
|
||||
@@ -28,24 +52,32 @@ func (tc *TokenCounter) Write(p []byte) (int, error) {
|
||||
if !tc.inWord {
|
||||
tc.Count++
|
||||
tc.inWord = true
|
||||
tc.inSpace = false
|
||||
tc.wordLen = 1
|
||||
} else {
|
||||
tc.wordLen++
|
||||
if tc.wordLen > 4 {
|
||||
tc.Count++
|
||||
tc.wordLen = 1
|
||||
}
|
||||
}
|
||||
} else if isSpace {
|
||||
if !tc.inSpace {
|
||||
tc.Count++
|
||||
tc.inSpace = true
|
||||
tc.inWord = false
|
||||
}
|
||||
tc.inWord = false
|
||||
tc.wordLen = 0
|
||||
} else {
|
||||
tc.Count++
|
||||
tc.inWord = false
|
||||
tc.inSpace = false
|
||||
tc.wordLen = 0
|
||||
}
|
||||
}
|
||||
|
||||
n, err := tc.w.Write(p)
|
||||
tc.Err = err
|
||||
return n, err
|
||||
var n int
|
||||
if tc.w != nil {
|
||||
n, tc.Err = tc.w.Write(p)
|
||||
} else {
|
||||
n = len(p)
|
||||
}
|
||||
|
||||
return n, tc.Err
|
||||
}
|
||||
|
||||
func (tc *TokenCounter) WriteByte(c byte) error {
|
||||
@@ -57,12 +89,5 @@ func (tc *TokenCounter) Printf(format string, a ...any) {
|
||||
if tc.Err != nil {
|
||||
return
|
||||
}
|
||||
_, _ = fmt.Fprintf(tc, format, a...)
|
||||
}
|
||||
|
||||
func (tc *TokenCounter) Println(a ...any) {
|
||||
if tc.Err != nil {
|
||||
return
|
||||
}
|
||||
_, _ = fmt.Fprintln(tc, a...)
|
||||
fmt.Fprintf(tc, format, a...)
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user