mirror of
https://github.com/skidoodle/ctx.git
synced 2026-04-28 03:07:41 +02:00
Refine TokenCounter handling and output
Decode UTF-8 across Write calls, buffering any partial rune left at the end of a write. Approximate tokenization by splitting long alphanumeric runs (more than 4 characters) and properly count spaces and punctuation. Remove TokenCounter Printf/Println helpers and update callers to use fmt.Fprintf/Fprintln. Avoid writing when the underlying writer is nil.
This commit is contained in:
@@ -30,19 +30,19 @@ func writeOutput(root string, files []string, outputPath string) (count int64, e
|
||||
|
||||
tc := &TokenCounter{w: bw}
|
||||
|
||||
tc.Printf("Project Path: %s\n\n", filepath.Base(root))
|
||||
tc.Println("Source Tree:")
|
||||
tc.Println("")
|
||||
fmt.Fprintf(tc, "Project Path: %s\n\n", filepath.Base(root))
|
||||
fmt.Fprintln(tc, "Source Tree:")
|
||||
fmt.Fprintln(tc, "")
|
||||
|
||||
tc.Println("```txt")
|
||||
tc.Println(filepath.Base(root))
|
||||
fmt.Fprintln(tc, "```txt")
|
||||
fmt.Fprintln(tc, filepath.Base(root))
|
||||
|
||||
if err := writeTree(tc, files); err != nil {
|
||||
return 0, err
|
||||
}
|
||||
|
||||
tc.Println("```")
|
||||
tc.Println("")
|
||||
fmt.Fprintln(tc, "```")
|
||||
fmt.Fprintln(tc, "")
|
||||
|
||||
for _, file := range files {
|
||||
if file == outputPath || filepath.Base(file) == outputPath {
|
||||
@@ -52,7 +52,7 @@ func writeOutput(root string, files []string, outputPath string) (count int64, e
|
||||
fullPath := filepath.Join(root, file)
|
||||
content, err := os.ReadFile(fullPath)
|
||||
if err != nil {
|
||||
tc.Printf("Error reading %s: %v\n", file, err)
|
||||
fmt.Fprintf(tc, "Error reading %s: %v\n", file, err)
|
||||
continue
|
||||
}
|
||||
|
||||
@@ -61,8 +61,8 @@ func writeOutput(root string, files []string, outputPath string) (count int64, e
|
||||
ext = "txt"
|
||||
}
|
||||
|
||||
tc.Printf("`%s`:\n\n", file)
|
||||
tc.Printf("```%s\n", ext)
|
||||
fmt.Fprintf(tc, "`%s`:\n\n", file)
|
||||
fmt.Fprintf(tc, "```%s\n", ext)
|
||||
|
||||
if _, err := tc.Write(content); err != nil {
|
||||
return 0, err
|
||||
@@ -73,8 +73,8 @@ func writeOutput(root string, files []string, outputPath string) (count int64, e
|
||||
return 0, err
|
||||
}
|
||||
}
|
||||
tc.Println("```")
|
||||
tc.Println("")
|
||||
fmt.Fprintln(tc, "```")
|
||||
fmt.Fprintln(tc, "")
|
||||
}
|
||||
|
||||
return tc.Count, tc.Err
|
||||
@@ -116,9 +116,9 @@ func printNode(w io.Writer, node map[string]any, prefix string) error {
|
||||
|
||||
children := node[key].(map[string]any)
|
||||
if len(children) > 0 {
|
||||
childPrefix := prefix + "│ "
|
||||
childPrefix := prefix + "│ "
|
||||
if isLast {
|
||||
childPrefix = prefix + " "
|
||||
childPrefix = prefix + " "
|
||||
}
|
||||
if err := printNode(w, children, childPrefix); err != nil {
|
||||
return err
|
||||
|
||||
@@ -4,14 +4,17 @@ import (
|
||||
"fmt"
|
||||
"io"
|
||||
"unicode"
|
||||
"unicode/utf8"
|
||||
)
|
||||
|
||||
type TokenCounter struct {
|
||||
w io.Writer
|
||||
Count int64
|
||||
Err error
|
||||
inWord bool
|
||||
inSpace bool
|
||||
w io.Writer
|
||||
Count int64
|
||||
Err error
|
||||
|
||||
leftover []byte
|
||||
inWord bool
|
||||
wordLen int
|
||||
}
|
||||
|
||||
func (tc *TokenCounter) Write(p []byte) (int, error) {
|
||||
@@ -19,8 +22,29 @@ func (tc *TokenCounter) Write(p []byte) (int, error) {
|
||||
return 0, tc.Err
|
||||
}
|
||||
|
||||
for _, b := range p {
|
||||
r := rune(b)
|
||||
data := p
|
||||
if len(tc.leftover) > 0 {
|
||||
data = make([]byte, len(tc.leftover)+len(p))
|
||||
copy(data, tc.leftover)
|
||||
copy(data[len(tc.leftover):], p)
|
||||
}
|
||||
|
||||
totalProcessed := 0
|
||||
|
||||
for len(data) > 0 {
|
||||
r, size := utf8.DecodeRune(data)
|
||||
|
||||
if r == utf8.RuneError && size == 1 {
|
||||
if len(data) < utf8.UTFMax {
|
||||
tc.leftover = data
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
data = data[size:]
|
||||
totalProcessed += size
|
||||
tc.leftover = nil
|
||||
|
||||
isSpace := unicode.IsSpace(r)
|
||||
isAlpha := unicode.IsLetter(r) || unicode.IsNumber(r) || r == '_'
|
||||
|
||||
@@ -28,24 +52,32 @@ func (tc *TokenCounter) Write(p []byte) (int, error) {
|
||||
if !tc.inWord {
|
||||
tc.Count++
|
||||
tc.inWord = true
|
||||
tc.inSpace = false
|
||||
tc.wordLen = 1
|
||||
} else {
|
||||
tc.wordLen++
|
||||
if tc.wordLen > 4 {
|
||||
tc.Count++
|
||||
tc.wordLen = 1
|
||||
}
|
||||
}
|
||||
} else if isSpace {
|
||||
if !tc.inSpace {
|
||||
tc.Count++
|
||||
tc.inSpace = true
|
||||
tc.inWord = false
|
||||
}
|
||||
tc.inWord = false
|
||||
tc.wordLen = 0
|
||||
} else {
|
||||
tc.Count++
|
||||
tc.inWord = false
|
||||
tc.inSpace = false
|
||||
tc.wordLen = 0
|
||||
}
|
||||
}
|
||||
|
||||
n, err := tc.w.Write(p)
|
||||
tc.Err = err
|
||||
return n, err
|
||||
var n int
|
||||
if tc.w != nil {
|
||||
n, tc.Err = tc.w.Write(p)
|
||||
} else {
|
||||
n = len(p)
|
||||
}
|
||||
|
||||
return n, tc.Err
|
||||
}
|
||||
|
||||
func (tc *TokenCounter) WriteByte(c byte) error {
|
||||
@@ -57,12 +89,5 @@ func (tc *TokenCounter) Printf(format string, a ...any) {
|
||||
if tc.Err != nil {
|
||||
return
|
||||
}
|
||||
_, _ = fmt.Fprintf(tc, format, a...)
|
||||
}
|
||||
|
||||
func (tc *TokenCounter) Println(a ...any) {
|
||||
if tc.Err != nil {
|
||||
return
|
||||
}
|
||||
_, _ = fmt.Fprintln(tc, a...)
|
||||
fmt.Fprintf(tc, format, a...)
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user