init claude-code
This commit is contained in:
+300
@@ -0,0 +1,300 @@
|
||||
import { randomUUID } from 'crypto'
|
||||
import { mkdir, readdir, readFile } from 'fs/promises'
|
||||
import { join } from 'path'
|
||||
import {
|
||||
PDF_MAX_EXTRACT_SIZE,
|
||||
PDF_TARGET_RAW_SIZE,
|
||||
} from '../constants/apiLimits.js'
|
||||
import { errorMessage } from './errors.js'
|
||||
import { execFileNoThrow } from './execFileNoThrow.js'
|
||||
import { formatFileSize } from './format.js'
|
||||
import { getFsImplementation } from './fsOperations.js'
|
||||
import { getToolResultsDir } from './toolResultStorage.js'
|
||||
|
||||
export type PDFError = {
|
||||
reason:
|
||||
| 'empty'
|
||||
| 'too_large'
|
||||
| 'password_protected'
|
||||
| 'corrupted'
|
||||
| 'unknown'
|
||||
| 'unavailable'
|
||||
message: string
|
||||
}
|
||||
|
||||
export type PDFResult<T> =
|
||||
| { success: true; data: T }
|
||||
| { success: false; error: PDFError }
|
||||
|
||||
/**
|
||||
* Read a PDF file and return it as base64-encoded data.
|
||||
* @param filePath Path to the PDF file
|
||||
* @returns Result containing PDF data or a structured error
|
||||
*/
|
||||
export async function readPDF(filePath: string): Promise<
|
||||
PDFResult<{
|
||||
type: 'pdf'
|
||||
file: {
|
||||
filePath: string
|
||||
base64: string
|
||||
originalSize: number
|
||||
}
|
||||
}>
|
||||
> {
|
||||
try {
|
||||
const fs = getFsImplementation()
|
||||
const stats = await fs.stat(filePath)
|
||||
const originalSize = stats.size
|
||||
|
||||
// Check if file is empty
|
||||
if (originalSize === 0) {
|
||||
return {
|
||||
success: false,
|
||||
error: { reason: 'empty', message: `PDF file is empty: ${filePath}` },
|
||||
}
|
||||
}
|
||||
|
||||
// Check if PDF exceeds maximum size
|
||||
// The API has a 32MB total request limit. After base64 encoding (~33% larger),
|
||||
// a PDF must be under ~20MB raw to leave room for conversation context.
|
||||
if (originalSize > PDF_TARGET_RAW_SIZE) {
|
||||
return {
|
||||
success: false,
|
||||
error: {
|
||||
reason: 'too_large',
|
||||
message: `PDF file exceeds maximum allowed size of ${formatFileSize(PDF_TARGET_RAW_SIZE)}.`,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
const fileBuffer = await readFile(filePath)
|
||||
|
||||
// Validate PDF magic bytes — reject files that aren't actually PDFs
|
||||
// (e.g., HTML files renamed to .pdf) before they enter conversation context.
|
||||
// Once an invalid PDF document block is in the message history, every subsequent
|
||||
// API call fails with 400 "The PDF specified was not valid" and the session
|
||||
// becomes unrecoverable without /clear.
|
||||
const header = fileBuffer.subarray(0, 5).toString('ascii')
|
||||
if (!header.startsWith('%PDF-')) {
|
||||
return {
|
||||
success: false,
|
||||
error: {
|
||||
reason: 'corrupted',
|
||||
message: `File is not a valid PDF (missing %PDF- header): ${filePath}`,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
const base64 = fileBuffer.toString('base64')
|
||||
|
||||
// Note: We cannot check page count here without parsing the PDF
|
||||
// The API will enforce the 100-page limit and return an error if exceeded
|
||||
|
||||
return {
|
||||
success: true,
|
||||
data: {
|
||||
type: 'pdf',
|
||||
file: {
|
||||
filePath,
|
||||
base64,
|
||||
originalSize,
|
||||
},
|
||||
},
|
||||
}
|
||||
} catch (e: unknown) {
|
||||
return {
|
||||
success: false,
|
||||
error: {
|
||||
reason: 'unknown',
|
||||
message: errorMessage(e),
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the number of pages in a PDF file using `pdfinfo` (from poppler-utils).
|
||||
* Returns `null` if pdfinfo is not available or if the page count cannot be determined.
|
||||
*/
|
||||
export async function getPDFPageCount(
|
||||
filePath: string,
|
||||
): Promise<number | null> {
|
||||
const { code, stdout } = await execFileNoThrow('pdfinfo', [filePath], {
|
||||
timeout: 10_000,
|
||||
useCwd: false,
|
||||
})
|
||||
if (code !== 0) {
|
||||
return null
|
||||
}
|
||||
const match = /^Pages:\s+(\d+)/m.exec(stdout)
|
||||
if (!match) {
|
||||
return null
|
||||
}
|
||||
const count = parseInt(match[1]!, 10)
|
||||
return isNaN(count) ? null : count
|
||||
}
|
||||
|
||||
export type PDFExtractPagesResult = {
|
||||
type: 'parts'
|
||||
file: {
|
||||
filePath: string
|
||||
originalSize: number
|
||||
count: number
|
||||
outputDir: string
|
||||
}
|
||||
}
|
||||
|
||||
let pdftoppmAvailable: boolean | undefined
|
||||
|
||||
/**
|
||||
* Reset the pdftoppm availability cache. Used by tests only.
|
||||
*/
|
||||
export function resetPdftoppmCache(): void {
|
||||
pdftoppmAvailable = undefined
|
||||
}
|
||||
|
||||
/**
|
||||
* Check whether the `pdftoppm` binary (from poppler-utils) is available.
|
||||
* The result is cached for the lifetime of the process.
|
||||
*/
|
||||
export async function isPdftoppmAvailable(): Promise<boolean> {
|
||||
if (pdftoppmAvailable !== undefined) return pdftoppmAvailable
|
||||
const { code, stderr } = await execFileNoThrow('pdftoppm', ['-v'], {
|
||||
timeout: 5000,
|
||||
useCwd: false,
|
||||
})
|
||||
// pdftoppm prints version info to stderr and exits 0 (or sometimes 99 on older versions)
|
||||
pdftoppmAvailable = code === 0 || stderr.length > 0
|
||||
return pdftoppmAvailable
|
||||
}
|
||||
|
||||
/**
|
||||
* Extract PDF pages as JPEG images using pdftoppm.
|
||||
* Produces page-01.jpg, page-02.jpg, etc. in an output directory.
|
||||
* This enables reading large PDFs and works with all API providers.
|
||||
*
|
||||
* @param filePath Path to the PDF file
|
||||
* @param options Optional page range (1-indexed, inclusive)
|
||||
*/
|
||||
export async function extractPDFPages(
|
||||
filePath: string,
|
||||
options?: { firstPage?: number; lastPage?: number },
|
||||
): Promise<PDFResult<PDFExtractPagesResult>> {
|
||||
try {
|
||||
const fs = getFsImplementation()
|
||||
const stats = await fs.stat(filePath)
|
||||
const originalSize = stats.size
|
||||
|
||||
if (originalSize === 0) {
|
||||
return {
|
||||
success: false,
|
||||
error: { reason: 'empty', message: `PDF file is empty: ${filePath}` },
|
||||
}
|
||||
}
|
||||
|
||||
if (originalSize > PDF_MAX_EXTRACT_SIZE) {
|
||||
return {
|
||||
success: false,
|
||||
error: {
|
||||
reason: 'too_large',
|
||||
message: `PDF file exceeds maximum allowed size for text extraction (${formatFileSize(PDF_MAX_EXTRACT_SIZE)}).`,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
const available = await isPdftoppmAvailable()
|
||||
if (!available) {
|
||||
return {
|
||||
success: false,
|
||||
error: {
|
||||
reason: 'unavailable',
|
||||
message:
|
||||
'pdftoppm is not installed. Install poppler-utils (e.g. `brew install poppler` or `apt-get install poppler-utils`) to enable PDF page rendering.',
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
const uuid = randomUUID()
|
||||
const outputDir = join(getToolResultsDir(), `pdf-${uuid}`)
|
||||
await mkdir(outputDir, { recursive: true })
|
||||
|
||||
// pdftoppm produces files like <prefix>-01.jpg, <prefix>-02.jpg, etc.
|
||||
const prefix = join(outputDir, 'page')
|
||||
const args = ['-jpeg', '-r', '100']
|
||||
if (options?.firstPage) {
|
||||
args.push('-f', String(options.firstPage))
|
||||
}
|
||||
if (options?.lastPage && options.lastPage !== Infinity) {
|
||||
args.push('-l', String(options.lastPage))
|
||||
}
|
||||
args.push(filePath, prefix)
|
||||
const { code, stderr } = await execFileNoThrow('pdftoppm', args, {
|
||||
timeout: 120_000,
|
||||
useCwd: false,
|
||||
})
|
||||
|
||||
if (code !== 0) {
|
||||
if (/password/i.test(stderr)) {
|
||||
return {
|
||||
success: false,
|
||||
error: {
|
||||
reason: 'password_protected',
|
||||
message:
|
||||
'PDF is password-protected. Please provide an unprotected version.',
|
||||
},
|
||||
}
|
||||
}
|
||||
if (/damaged|corrupt|invalid/i.test(stderr)) {
|
||||
return {
|
||||
success: false,
|
||||
error: {
|
||||
reason: 'corrupted',
|
||||
message: 'PDF file is corrupted or invalid.',
|
||||
},
|
||||
}
|
||||
}
|
||||
return {
|
||||
success: false,
|
||||
error: { reason: 'unknown', message: `pdftoppm failed: ${stderr}` },
|
||||
}
|
||||
}
|
||||
|
||||
// Read generated image files and sort naturally
|
||||
const entries = await readdir(outputDir)
|
||||
const imageFiles = entries.filter(f => f.endsWith('.jpg')).sort()
|
||||
const pageCount = imageFiles.length
|
||||
|
||||
if (pageCount === 0) {
|
||||
return {
|
||||
success: false,
|
||||
error: {
|
||||
reason: 'corrupted',
|
||||
message: 'pdftoppm produced no output pages. The PDF may be invalid.',
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
const count = imageFiles.length
|
||||
|
||||
return {
|
||||
success: true,
|
||||
data: {
|
||||
type: 'parts',
|
||||
file: {
|
||||
filePath,
|
||||
originalSize,
|
||||
outputDir,
|
||||
count,
|
||||
},
|
||||
},
|
||||
}
|
||||
} catch (e: unknown) {
|
||||
return {
|
||||
success: false,
|
||||
error: {
|
||||
reason: 'unknown',
|
||||
message: errorMessage(e),
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user