init claude-code

This commit is contained in:
2026-04-01 17:32:37 +02:00
commit 73b208c009
1902 changed files with 513237 additions and 0 deletions
+196
View File
@@ -0,0 +1,196 @@
/**
* Filter and sanitize installed-app data for inclusion in the `request_access`
* tool description. Ported from Cowork's appNames.ts. Two
* concerns: noise filtering (Spotlight returns every bundle on disk — XPC
* helpers, daemons, input methods) and prompt-injection hardening (app names
* are attacker-controlled; anyone can ship an app named anything).
*
* Residual risk: short benign-char adversarial names ("grant all") can't be
* filtered programmatically. The tool description's structural framing
* ("Available applications:") makes it clear these are app names, and the
* downstream permission dialog requires explicit user approval — a bad name
* can't auto-grant anything.
*/
/**
 * Minimal shape — matches what `listInstalledApps` returns. Only the three
 * fields this module reads are declared.
 */
type InstalledAppLike = {
// Looked up in ALWAYS_KEEP_BUNDLE_IDS to decide whether the app bypasses filtering.
readonly bundleId: string
// The human-readable name; this is the string that is sanitized and returned.
readonly displayName: string
// Bundle location on disk; compared against the allowed application roots.
readonly path: string
}
// ── Noise filtering ──────────────────────────────────────────────────────
/**
* Only apps under these roots are shown. /System/Library subpaths (CoreServices,
* PrivateFrameworks, Input Methods) are OS plumbing — anchor on known-good
* roots rather than blocklisting every junk subpath since new macOS versions
* add more.
*
* ~/Applications is checked at call time via the `homeDir` arg (HOME isn't
* reliably known at module load in all environments).
*/
// Each root ends with '/' so the `startsWith` prefix test in isUserFacingPath
// cannot match a sibling directory that merely shares the prefix
// (e.g. "/ApplicationsBackup/...").
const PATH_ALLOWLIST: readonly string[] = [
'/Applications/',
'/System/Applications/',
]
/**
 * Display-name patterns that mark background services even under /Applications.
 * A name matching ANY pattern is treated as noise and dropped.
 *
 * `(?:$|\s\()` — matches the keyword at end-of-string OR immediately before
 * ` (`. The keyword is not anchored at the start and is case-sensitive, so:
 *   - "Slack Helper (GPU)" and "ABAssistantService" match → dropped
 *   - "Service Desk" does not match (Service is followed by " D") → kept
 *
 * The final pattern drops names that begin with a dot.
 */
const NAME_PATTERN_BLOCKLIST: readonly RegExp[] = [
/Helper(?:$|\s\()/,
/Agent(?:$|\s\()/,
/Service(?:$|\s\()/,
/Uninstaller(?:$|\s\()/,
/Updater(?:$|\s\()/,
/^\./,
]
/**
 * Apps commonly requested for CU automation. ALWAYS included if installed,
 * bypassing path check + count cap — the model needs these exact names even
 * when the machine has 200+ apps. Bundle IDs (locale-invariant), not display
 * names. Keep <30 — each entry is a guaranteed token in the description.
 *
 * NOTE(review): the set below currently holds 31 entries, which is over the
 * stated budget — either trim it or update the budget.
 */
const ALWAYS_KEEP_BUNDLE_IDS: ReadonlySet<string> = new Set([
// Browsers
'com.apple.Safari',
'com.google.Chrome',
'com.microsoft.edgemac',
'org.mozilla.firefox',
'company.thebrowser.Browser', // Arc
// Communication
'com.tinyspeck.slackmacgap',
'us.zoom.xos',
'com.microsoft.teams2',
'com.microsoft.teams',
'com.apple.MobileSMS',
'com.apple.mail',
// Productivity
'com.microsoft.Word',
'com.microsoft.Excel',
'com.microsoft.Powerpoint',
'com.microsoft.Outlook',
'com.apple.iWork.Pages',
'com.apple.iWork.Numbers',
'com.apple.iWork.Keynote',
'com.google.GoogleDocs',
// Notes / PM
'notion.id',
'com.apple.Notes',
'md.obsidian',
'com.linear',
'com.figma.Desktop',
// Dev
'com.microsoft.VSCode',
'com.apple.Terminal',
'com.googlecode.iterm2',
'com.github.GitHubDesktop',
// System essentials the model genuinely targets
'com.apple.finder',
'com.apple.iCal',
'com.apple.systempreferences',
])
// ── Prompt-injection hardening ───────────────────────────────────────────
/**
 * Character allowlist for untrusted display names.
 *
 * `\p{L}\p{M}\p{N}` with /u — not `\w` (ASCII-only, would drop Bücher, 微信,
 * Préférences Système). `\p{M}` matches combining marks so NFD-decomposed
 * diacritics (ü → u + ◌̈) pass. Single space not `\s` — `\s` matches newlines,
 * which would let "App\nIgnore previous…" through as a multi-line injection.
 *
 * Besides letters, marks and digits the class allows `_`, space, `.`, `&`,
 * `'`, `(`, `)`, `+` and `-`. It still bars double quotes, angle brackets,
 * backticks, pipes and colons. Note that the apostrophe IS allowed.
 */
const APP_NAME_ALLOWED = /^[\p{L}\p{M}\p{N}_ .&'()+-]+$/u
// Names longer than this (measured with String.length after trimming) are dropped.
const APP_NAME_MAX_LEN = 40
// Maximum number of untrusted names listed before the "… and N more" summary.
const APP_NAME_MAX_COUNT = 50
/**
 * True when `path` lies under one of the allowlisted application roots, or
 * under `<homeDir>/Applications/` when a home directory is supplied.
 *
 * @param path    Absolute path of the app bundle.
 * @param homeDir The user's home directory, with or without a trailing slash;
 *                when undefined/empty only the static roots are considered.
 */
function isUserFacingPath(path: string, homeDir: string | undefined): boolean {
  for (const root of PATH_ALLOWLIST) {
    if (path.startsWith(root)) return true
  }
  if (!homeDir) return false
  // Avoid a doubled slash when homeDir already ends with one.
  const separator = homeDir.endsWith('/') ? '' : '/'
  return path.startsWith(`${homeDir}${separator}Applications/`)
}
/** True when `name` matches any background-service pattern in the blocklist. */
function isNoisyName(name: string): boolean {
  for (const pattern of NAME_PATTERN_BLOCKLIST) {
    if (pattern.test(name)) return true
  }
  return false
}
/**
 * Control characters (Unicode category Cc — includes \n, \r, \t) plus the
 * line/paragraph separators U+2028/U+2029. `trim()` only strips these at the
 * ends of a name; an interior one would split the name across lines in the
 * tool description.
 */
const NAME_CONTROL_CHARS = /[\p{Cc}\u2028\u2029]/u
/**
 * Trim + length cap + dedupe + sort.
 *
 * `applyCharFilter` — skip for always-keep bundle IDs (a localized
 * "Réglages Système" with unusual punctuation shouldn't be dropped), apply
 * for anything attacker-installable.
 *
 * Control characters and line separators are rejected REGARDLESS of
 * `applyCharFilter`: the bundle ID that selects the trusted path is read from
 * the app's own metadata (presumably spoofable — verify against
 * `listInstalledApps`), so the trusted path must not be a way to smuggle a
 * multi-line name into the description.
 *
 * @param raw             Display names, in any order, possibly with duplicates.
 * @param applyCharFilter When true, names must also match APP_NAME_ALLOWED.
 * @returns A new array of unique trimmed names sorted with `localeCompare`.
 */
function sanitizeCore(
  raw: readonly string[],
  applyCharFilter: boolean,
): string[] {
  const seen = new Set<string>()
  const kept: string[] = []
  for (const name of raw) {
    const trimmed = name.trim()
    if (!trimmed || trimmed.length > APP_NAME_MAX_LEN) continue
    if (NAME_CONTROL_CHARS.test(trimmed)) continue
    if (applyCharFilter && !APP_NAME_ALLOWED.test(trimmed)) continue
    // First occurrence wins; later duplicates are dropped.
    if (seen.has(trimmed)) continue
    seen.add(trimmed)
    kept.push(trimmed)
  }
  return kept.sort((a, b) => a.localeCompare(b))
}
/**
 * Sanitize untrusted names (character allowlist applied) and cap the list at
 * APP_NAME_MAX_COUNT entries, appending a "… and N more" summary line when
 * names were cut off.
 */
function sanitizeAppNames(raw: readonly string[]): string[] {
  const names = sanitizeCore(raw, true)
  const overflow = names.length - APP_NAME_MAX_COUNT
  if (overflow <= 0) return names
  const shown = names.slice(0, APP_NAME_MAX_COUNT)
  shown.push(`… and ${overflow} more`)
  return shown
}
/**
 * Sanitize names of always-keep apps: same trim / length cap / dedupe / sort
 * as `sanitizeAppNames`, but without the character allowlist and without the
 * count cap.
 */
function sanitizeTrustedNames(raw: readonly string[]): string[] {
return sanitizeCore(raw, false)
}
/**
* Filter raw Spotlight results to user-facing apps, then sanitize. Always-keep
* apps bypass path/name filter AND char allowlist (trusted vendors, not
* attacker-installed); still length-capped, deduped, sorted.
*/
export function filterAppsForDescription(
  installed: readonly InstalledAppLike[],
  homeDir: string | undefined,
): string[] {
  // Partition display names: always-keep apps vs. everything that survives
  // the path and noisy-name filters. Anything else is discarded.
  const alwaysKept: string[] = []
  const rest: string[] = []
  for (const app of installed) {
    if (ALWAYS_KEEP_BUNDLE_IDS.has(app.bundleId)) {
      alwaysKept.push(app.displayName)
      continue
    }
    if (!isUserFacingPath(app.path, homeDir)) continue
    if (isNoisyName(app.displayName)) continue
    rest.push(app.displayName)
  }

  const trusted = sanitizeTrustedNames(alwaysKept)
  const trustedSet = new Set(trusted)
  // Trusted names come first; drop any untrusted entry that repeats one.
  const others = sanitizeAppNames(rest).filter(name => !trustedSet.has(name))
  return trusted.concat(others)
}
+86
View File
@@ -0,0 +1,86 @@
import type { ToolUseContext } from '../../Tool.js'
import { logForDebugging } from '../debug.js'
import { errorMessage } from '../errors.js'
import { withResolvers } from '../withResolvers.js'
import { isLockHeldLocally, releaseComputerUseLock } from './computerUseLock.js'
import { unregisterEscHotkey } from './escHotkey.js'
// Upper bound (milliseconds) that turn-end cleanup waits for the unhide call.
//
// cu.apps.unhide is NOT one of the four @MainActor methods wrapped by
// drainRunLoop's 30s backstop. On abort paths (where the user hit Ctrl+C
// because something was slow) a hang here would wedge the abort. Generous
// timeout — unhide should be ~instant; if it takes 5s something is wrong
// and proceeding is better than waiting. The Swift call continues in the
// background regardless; we just stop blocking on it.
const UNHIDE_TIMEOUT_MS = 5000
/**
* Turn-end cleanup for the chicago MCP surface: auto-unhide apps that
* `prepareForAction` hid, then release the file-based lock.
*
* Called from three sites: natural turn end (`stopHooks.ts`), abort during
* streaming (`query.ts` aborted_streaming), abort during tool execution
* (`query.ts` aborted_tools). All three reach this via dynamic import gated
* on `feature('CHICAGO_MCP')`. `executor.js` (which pulls both native
* modules) is dynamic-imported below so non-CU turns don't load native
* modules just to no-op.
*
* No-ops cheaply on non-CU turns: both gate checks are zero-syscall.
*/
export async function cleanupComputerUseAfterTurn(
  ctx: Pick<
    ToolUseContext,
    'getAppState' | 'setAppState' | 'sendOSNotification'
  >,
): Promise<void> {
  const appState = ctx.getAppState()
  const hidden = appState.computerUseMcpState?.hiddenDuringTurn
  if (hidden && hidden.size > 0) {
    let timer: ReturnType<typeof setTimeout> | undefined
    // The whole unhide phase is best-effort. A rejected dynamic import or a
    // synchronous throw must not escape, otherwise the lock release below is
    // skipped and the next CU session is blocked by a lock nobody holds.
    try {
      const { unhideComputerUseApps } = await import('./executor.js')
      const unhide = unhideComputerUseApps([...hidden]).catch(err =>
        logForDebugging(
          `[Computer Use MCP] auto-unhide failed: ${errorMessage(err)}`,
        ),
      )
      // Race against a timer so a hung native call cannot wedge the abort.
      const timeout = withResolvers<void>()
      timer = setTimeout(timeout.resolve, UNHIDE_TIMEOUT_MS)
      await Promise.race([unhide, timeout.promise])
    } catch (err) {
      logForDebugging(
        `[Computer Use MCP] auto-unhide failed: ${errorMessage(err)}`,
      )
    } finally {
      clearTimeout(timer)
    }
    // Clear the hidden set whether or not the unhide succeeded (same as on a
    // logged unhide rejection); skip the state write if it is already unset.
    ctx.setAppState(prev =>
      prev.computerUseMcpState?.hiddenDuringTurn === undefined
        ? prev
        : {
            ...prev,
            computerUseMcpState: {
              ...prev.computerUseMcpState,
              hiddenDuringTurn: undefined,
            },
          },
    )
  }
  // Zero-syscall pre-check so non-CU turns don't touch disk. Release is still
  // idempotent (returns false if already released or owned by another session).
  if (!isLockHeldLocally()) return
  // Unregister before lock release so the pump-retain drops as soon as the
  // CU session ends. Idempotent — no-ops if registration failed at acquire.
  // Swallow throws so a NAPI unregister error never prevents lock release —
  // a held lock blocks the next CU session with "in use by another session".
  try {
    unregisterEscHotkey()
  } catch (err) {
    logForDebugging(
      `[Computer Use MCP] unregisterEscHotkey failed: ${errorMessage(err)}`,
    )
  }
  // Only announce the exit when this call actually removed the lock file.
  if (await releaseComputerUseLock()) {
    ctx.sendOSNotification?.({
      message: 'Claude is done using your computer',
      notificationType: 'computer_use_exit',
    })
  }
}
+61
View File
@@ -0,0 +1,61 @@
import { normalizeNameForMCP } from '../../services/mcp/normalization.js'
import { env } from '../env.js'
/**
 * Name of the computer-use MCP server. `isComputerUseMCPServer` compares a
 * normalized server name against this value.
 */
export const COMPUTER_USE_MCP_SERVER_NAME = 'computer-use'
/**
 * Sentinel bundle ID for the frontmost gate. Claude Code is a terminal — it has
 * no window. This never matches a real `NSWorkspace.frontmostApplication`, so
 * the package's "host is frontmost" branch (mouse click-through exemption,
 * keyboard safety-net) is dead code for us. `prepareForAction`'s "exempt our
 * own window" is likewise a no-op — there is no window to exempt.
 */
export const CLI_HOST_BUNDLE_ID = 'com.anthropic.claude-code.cli-no-window'
/**
 * Fallback `env.terminal` → bundleId map for when `__CFBundleIdentifier` is
 * unset. Covers the macOS terminals we can distinguish — Linux entries
 * (konsole, gnome-terminal, xterm) are deliberately absent since
 * `createCliExecutor` is darwin-guarded.
 *
 * Keys are the exact strings `env.terminal` reports (case-sensitive); values
 * are the corresponding macOS bundle identifiers.
 */
const TERMINAL_BUNDLE_ID_FALLBACK: Readonly<Record<string, string>> = {
'iTerm.app': 'com.googlecode.iterm2',
Apple_Terminal: 'com.apple.Terminal',
ghostty: 'com.mitchellh.ghostty',
kitty: 'net.kovidgoyal.kitty',
WarpTerminal: 'dev.warp.Warp-Stable',
vscode: 'com.microsoft.VSCode',
}
/**
* Bundle ID of the terminal emulator we're running inside, so `prepareDisplay`
* can exempt it from hiding and `captureExcluding` can keep it out of
* screenshots. Returns null when undetectable (ssh, cleared env, unknown
* terminal) — caller must handle the null case.
*
* `__CFBundleIdentifier` is set by LaunchServices when a .app bundle spawns a
* process and is inherited by children. It's the exact bundleId, no lookup
* needed — handles terminals the fallback table doesn't know about. Under
* tmux/screen it reflects the terminal that started the SERVER, which may
* differ from the attached client. That's harmless here: we exempt A
* terminal window, and the screenshots exclude it regardless.
*/
export function getTerminalBundleId(): string | null {
  // Preferred source: the exact bundle ID inherited from the launching app.
  const cfBundleId = process.env.__CFBundleIdentifier
  if (cfBundleId) return cfBundleId
  // Fallback table lookup. Use an own-property check: the table is a plain
  // object, so a terminal name such as "constructor" or "toString" would
  // otherwise resolve to an inherited Object.prototype member (a function)
  // and be returned as if it were a bundle ID string.
  const terminal = env.terminal
  if (
    !terminal ||
    !Object.prototype.hasOwnProperty.call(TERMINAL_BUNDLE_ID_FALLBACK, terminal)
  ) {
    return null
  }
  return TERMINAL_BUNDLE_ID_FALLBACK[terminal] ?? null
}
/**
 * Static capabilities for macOS CLI. `hostBundleId` is not here — it's added
 * by `executor.ts` per `ComputerExecutor.capabilities`. `buildComputerUseTools`
 * takes this shape (no `hostBundleId`, no `teachMode`).
 *
 * Both fields use `as const` so they keep their literal types instead of
 * widening to `string`.
 */
export const CLI_CU_CAPABILITIES = {
screenshotFiltering: 'native' as const,
platform: 'darwin' as const,
}
/**
 * True when `name`, after MCP name normalization, equals the computer-use
 * server name.
 */
export function isComputerUseMCPServer(name: string): boolean {
  const normalized = normalizeNameForMCP(name)
  return normalized === COMPUTER_USE_MCP_SERVER_NAME
}
+215
View File
@@ -0,0 +1,215 @@
import { mkdir, readFile, unlink, writeFile } from 'fs/promises'
import { join } from 'path'
import { getSessionId } from '../../bootstrap/state.js'
import { registerCleanup } from '../../utils/cleanupRegistry.js'
import { logForDebugging } from '../../utils/debug.js'
import { getClaudeConfigHomeDir } from '../../utils/envUtils.js'
import { jsonParse, jsonStringify } from '../../utils/slowOperations.js'
import { getErrnoCode } from '../errors.js'
// File name of the lock; it is joined onto the Claude config home directory.
const LOCK_FILENAME = 'computer-use.lock'
// Holds the unregister function for the shutdown cleanup handler.
// Set when the lock is acquired, cleared when released. `isLockHeldLocally`
// uses "is this defined?" as its in-memory "we hold the lock" flag.
let unregisterCleanup: (() => void) | undefined
/** JSON payload stored in the lock file. */
type ComputerUseLock = {
// Session that owns the lock; compared against getSessionId().
readonly sessionId: string
// PID of the owning process; probed with signal 0 to detect stale locks.
readonly pid: number
// Date.now() at acquisition. Not checked by isComputerUseLock.
readonly acquiredAt: number
}
/**
 * Outcome of `tryAcquireComputerUseLock`. `fresh` is true when the lock file
 * was created by this call and false for a re-entrant acquire by the session
 * that already holds it. `by` is the blocking session's ID.
 */
export type AcquireResult =
| { readonly kind: 'acquired'; readonly fresh: boolean }
| { readonly kind: 'blocked'; readonly by: string }
/** Outcome of `checkComputerUseLock` (inspects the lock without taking it). */
export type CheckResult =
| { readonly kind: 'free' }
| { readonly kind: 'held_by_self' }
| { readonly kind: 'blocked'; readonly by: string }
// Shared result objects so the acquire paths don't allocate a new one each time.
const FRESH: AcquireResult = { kind: 'acquired', fresh: true }
const REENTRANT: AcquireResult = { kind: 'acquired', fresh: false }
/**
 * Runtime guard for parsed lock-file content. Accepts any non-null object
 * with a string `sessionId` and a numeric `pid`; `acquiredAt` is not checked.
 */
function isComputerUseLock(value: unknown): value is ComputerUseLock {
  if (value === null || typeof value !== 'object') return false
  if (!('sessionId' in value) || typeof value.sessionId !== 'string') {
    return false
  }
  if (!('pid' in value) || typeof value.pid !== 'number') return false
  return true
}
/** Absolute path of the lock file inside the Claude config home directory. */
function getLockPath(): string {
  const configHome = getClaudeConfigHomeDir()
  return join(configHome, LOCK_FILENAME)
}
/**
 * Read and validate the lock file. Returns `undefined` when the file is
 * missing, unreadable, not valid JSON, or does not have the lock shape —
 * callers cannot tell these cases apart.
 */
async function readLock(): Promise<ComputerUseLock | undefined> {
  try {
    const text = await readFile(getLockPath(), 'utf8')
    const candidate: unknown = jsonParse(text)
    if (isComputerUseLock(candidate)) return candidate
  } catch {
    // Fall through: any failure is reported as "no usable lock".
  }
  return undefined
}
/**
 * Check whether a process is still running (signal 0 probe).
 *
 * Only positive integer PIDs are probed. With POSIX `kill` semantics a PID of
 * 0 addresses the caller's own process group and a negative PID addresses a
 * process group (or every process, for -1), so the probe would succeed and a
 * lock file carrying such a value would look permanently alive. Those values
 * are reported as "not running" so the lock is recovered as stale.
 *
 * Any error from the probe is treated as "not running".
 *
 * Note: there is a small window for PID reuse — if the owning process
 * exits and an unrelated process is assigned the same PID, the check
 * will return true. This is extremely unlikely in practice.
 *
 * @param pid PID recorded in the lock file.
 * @returns true if the signal-0 probe succeeded for a valid PID.
 */
function isProcessRunning(pid: number): boolean {
  if (!Number.isInteger(pid) || pid <= 0) return false
  try {
    process.kill(pid, 0)
    return true
  } catch {
    return false
  }
}
/**
* Attempt to create the lock file atomically with O_EXCL.
* Returns true on success, false if the file already exists.
* Throws for other errors.
*/
async function tryCreateExclusive(lock: ComputerUseLock): Promise<boolean> {
  try {
    const target = getLockPath()
    const payload = jsonStringify(lock)
    // 'wx' = create for writing, fail if the path already exists.
    await writeFile(target, payload, { flag: 'wx' })
  } catch (err: unknown) {
    // Only "already exists" means "someone else holds it"; anything else is
    // a real failure and is rethrown.
    if (getErrnoCode(err) !== 'EEXIST') throw err
    return false
  }
  return true
}
/**
* Register a shutdown cleanup handler so the lock is released even if
* turn-end cleanup is never reached (e.g. the user runs /exit while
* a tool call is in progress).
*/
function registerLockCleanup(): void {
  // Drop any handler left from an earlier acquire so only one is registered.
  if (unregisterCleanup !== undefined) unregisterCleanup()
  const releaseOnShutdown = async (): Promise<void> => {
    await releaseComputerUseLock()
  }
  unregisterCleanup = registerCleanup(releaseOnShutdown)
}
/**
* Check lock state without acquiring. Used for `request_access` /
* `list_granted_applications` — the package's `defersLockAcquire` contract:
* these tools check but don't take the lock, so the enter-notification and
* overlay don't fire while the model is only asking for permission.
*
* Does stale-PID recovery (unlinks) so a dead session's lock doesn't block
* `request_access`. Does NOT create — that's `tryAcquireComputerUseLock`'s job.
*/
export async function checkComputerUseLock(): Promise<CheckResult> {
  const holder = await readLock()
  // Missing or unparseable lock file counts as free.
  if (holder === undefined) return { kind: 'free' }

  const { sessionId: holderSession, pid: holderPid } = holder
  if (holderSession === getSessionId()) return { kind: 'held_by_self' }

  if (!isProcessRunning(holderPid)) {
    // Owner is gone: remove the stale file (ignoring unlink errors) and
    // report the lock as free.
    logForDebugging(
      `Recovering stale computer-use lock from session ${holderSession} (PID ${holderPid})`,
    )
    await unlink(getLockPath()).catch(() => {})
    return { kind: 'free' }
  }
  return { kind: 'blocked', by: holderSession }
}
/**
* Zero-syscall check: does THIS process believe it holds the lock?
* True iff `tryAcquireComputerUseLock` succeeded and `releaseComputerUseLock`
* hasn't run yet. Used to gate the per-turn release in `cleanup.ts` so
* non-CU turns don't touch disk.
*/
export function isLockHeldLocally(): boolean {
// The shutdown-handler slot doubles as the in-memory "held" flag: it is set
// by registerLockCleanup and cleared by releaseComputerUseLock.
return unregisterCleanup !== undefined
}
/**
* Try to acquire the computer-use lock for the current session.
*
* `{kind: 'acquired', fresh: true}` — first tool call of a CU turn. Callers fire
* enter notifications on this. `{kind: 'acquired', fresh: false}` — re-entrant,
* same session already holds it. `{kind: 'blocked', by}` — another live session
* holds it.
*
* Uses O_EXCL (open 'wx') for atomic test-and-set — the OS guarantees at
* most one process sees the create succeed. If the file already exists,
* we check ownership and PID liveness; for a stale lock we unlink and
* retry the exclusive create once. If two sessions race to recover the
* same stale lock, only one create succeeds (the other reads the winner).
*/
export async function tryAcquireComputerUseLock(): Promise<AcquireResult> {
  const sessionId = getSessionId()
  const lock: ComputerUseLock = {
    sessionId,
    pid: process.pid,
    acquiredAt: Date.now(),
  }
  await mkdir(getClaudeConfigHomeDir(), { recursive: true })

  // Exclusive create; on success also register the shutdown release handler.
  const createAndRegister = async (): Promise<boolean> => {
    const created = await tryCreateExclusive(lock)
    if (created) registerLockCleanup()
    return created
  }

  // Remove the existing file and retry the exclusive create once. If another
  // session wins the retry, report being blocked by whoever now owns the file.
  const replaceExisting = async (): Promise<AcquireResult> => {
    await unlink(getLockPath()).catch(() => {})
    if (await createAndRegister()) return FRESH
    const winner = await readLock()
    return { kind: 'blocked', by: winner?.sessionId ?? 'unknown' }
  }

  // Fresh acquisition.
  if (await createAndRegister()) return FRESH

  const existing = await readLock()
  // Corrupt/unparseable — treat as stale (can't extract a blocking ID).
  if (!existing) return replaceExisting()
  // Already held by this session.
  if (existing.sessionId === sessionId) return REENTRANT
  // Another live session holds it — blocked.
  if (isProcessRunning(existing.pid)) {
    return { kind: 'blocked', by: existing.sessionId }
  }
  // Stale lock — recover.
  logForDebugging(
    `Recovering stale computer-use lock from session ${existing.sessionId} (PID ${existing.pid})`,
  )
  return replaceExisting()
}
/**
* Release the computer-use lock if the current session owns it. Returns
* `true` if we actually unlinked the file (i.e., we held it) — callers fire
* exit notifications on this. Idempotent: subsequent calls return `false`.
*/
export async function releaseComputerUseLock(): Promise<boolean> {
  // Drop the shutdown handler first; this also flips isLockHeldLocally().
  unregisterCleanup?.()
  unregisterCleanup = undefined

  const current = await readLock()
  const ownedByUs =
    current !== undefined && current.sessionId === getSessionId()
  if (!ownedByUs) return false

  let removed = true
  try {
    await unlink(getLockPath())
    logForDebugging('Released computer-use lock')
  } catch {
    // The file could not be removed — report "not released".
    removed = false
  }
  return removed
}
+79
View File
@@ -0,0 +1,79 @@
import { logForDebugging } from '../debug.js'
import { withResolvers } from '../withResolvers.js'
import { requireComputerUseSwift } from './swiftLoader.js'
/**
* Shared CFRunLoop pump. Swift's four `@MainActor` async methods
* (captureExcluding, captureRegion, apps.listInstalled, resolvePrepareCapture)
* and `@ant/computer-use-input`'s key()/keys() all dispatch to
* DispatchQueue.main. Under libuv (Node/bun) that queue never drains — the
* promises hang. Electron drains it via CFRunLoop so Cowork doesn't need this.
*
* One refcounted setInterval calls `_drainMainRunLoop` (RunLoop.main.run)
* every 1ms while any main-queue-dependent call is pending. Multiple
* concurrent drainRunLoop() calls share the single pump via retain/release.
*/
// Handle of the shared drain interval; undefined while no pump is running.
let pump: ReturnType<typeof setInterval> | undefined
// Reference count: incremented by retain(), decremented by release().
let pending = 0
/**
 * Interval callback: performs one drain of the main run loop. The Swift
 * module is received as an argument (forwarded by setInterval in `retain`).
 */
function drainTick(cu: ReturnType<typeof requireComputerUseSwift>): void {
cu._drainMainRunLoop()
}
/**
 * Take one reference on the shared pump, starting the 1ms drain interval if
 * it is not already running.
 *
 * The native module is resolved BEFORE the refcount is touched: if the loader
 * throws, `pending` stays unchanged. Incrementing first would leave a
 * reference nobody releases, and a pump started later would then never stop.
 */
function retain(): void {
  if (pump === undefined) {
    const cu = requireComputerUseSwift()
    pump = setInterval(drainTick, 1, cu)
    logForDebugging('[drainRunLoop] pump started', { level: 'verbose' })
  }
  pending++
}
/**
 * Drop one reference on the shared pump and stop the interval when the last
 * reference goes away.
 *
 * The count is clamped at zero: an unbalanced release while the pump is idle
 * must not drive `pending` negative, otherwise the next retain() would bring
 * it back to 0 and a later, balanced release() by a different holder would
 * stop the pump while another holder still depends on it.
 */
function release(): void {
  if (pending > 0) pending--
  if (pending === 0 && pump !== undefined) {
    clearInterval(pump)
    pump = undefined
    logForDebugging('[drainRunLoop] pump stopped', { level: 'verbose' })
  }
}
// Ceiling (milliseconds) for a single drainRunLoop() call.
const TIMEOUT_MS = 30_000
/**
 * Timer callback for drainRunLoop: rejects with the timeout error. It takes
 * the reject function as a parameter, which setTimeout forwards as its extra
 * argument.
 */
function timeoutReject(reject: (e: Error) => void): void {
reject(new Error(`computer-use native call exceeded ${TIMEOUT_MS}ms`))
}
/**
* Hold a pump reference for the lifetime of a long-lived registration
* (e.g. the CGEventTap Escape handler). Unlike `drainRunLoop(fn)` this has
* no timeout — the caller is responsible for calling `releasePump()`. Same
* refcount as drainRunLoop calls, so nesting is safe.
*/
// Public aliases of the module-private retain/release pair.
export const retainPump = retain
// Must be called exactly once for every retainPump() call.
export const releasePump = release
/**
* Await `fn()` with the shared drain pump running. Safe to nest — multiple
* concurrent drainRunLoop() calls share one setInterval.
*/
export async function drainRunLoop<T>(fn: () => Promise<T>): Promise<T> {
  retain()
  let deadlineTimer: ReturnType<typeof setTimeout> | undefined
  try {
    // fn() is invoked inside the try so that a synchronous throw still runs
    // the finally block and gives the pump reference back.
    const work = fn()
    // If the deadline wins the race, `work` is abandoned; this no-op handler
    // keeps a late rejection from surfacing as an unhandledRejection.
    work.catch(() => {})
    const { promise: deadline, reject: failDeadline } = withResolvers<never>()
    deadlineTimer = setTimeout(timeoutReject, TIMEOUT_MS, failDeadline)
    return await Promise.race([work, deadline])
  } finally {
    clearTimeout(deadlineTimer)
    release()
  }
}
+54
View File
@@ -0,0 +1,54 @@
import { logForDebugging } from '../debug.js'
import { releasePump, retainPump } from './drainRunLoop.js'
import { requireComputerUseSwift } from './swiftLoader.js'
/**
* Global Escape → abort. Mirrors Cowork's `escAbort.ts` but without Electron:
* CGEventTap via `@ant/computer-use-swift`. While registered, Escape is
* consumed system-wide (PI defense — a prompt-injected action can't dismiss
* a dialog with Escape).
*
* Lifecycle: register on fresh lock acquire (`wrapper.tsx` `acquireCuLock`),
* unregister on lock release (`cleanup.ts`). The tap's CFRunLoopSource sits
* in .defaultMode on CFRunLoopGetMain(), so we hold a drainRunLoop pump
* retain for the registration's lifetime — same refcounted setInterval as
* the `@MainActor` methods.
*
* `notifyExpectedEscape()` punches a hole for model-synthesized Escapes: the
* executor's `key("escape")` calls it before posting the CGEvent. Swift
* schedules a 100ms decay so a CGEvent that never reaches the tap callback
* doesn't eat the next user ESC.
*/
// True while the Escape tap is installed AND this module holds a pump
// reference for it; guards against double register / double release.
let registered = false
/**
 * Install the global Escape handler. Idempotent: returns true immediately if
 * already registered.
 *
 * @param onEscape Callback handed to the native `registerEscape`.
 * @returns true when the handler is (now or already) registered, false when
 *          the native registration reported failure.
 */
export function registerEscHotkey(onEscape: () => void): boolean {
  if (!registered) {
    const tapInstalled = requireComputerUseSwift().hotkey.registerEscape(onEscape)
    if (!tapInstalled) {
      // CGEvent.tapCreate failed — typically missing Accessibility permission.
      // CU still works, just without ESC abort. Mirrors Cowork's escAbort.ts:81.
      logForDebugging('[cu-esc] registerEscape returned false', { level: 'warn' })
      return false
    }
    // Hold a pump reference for as long as the tap stays registered.
    retainPump()
    registered = true
    logForDebugging('[cu-esc] registered')
  }
  return true
}
/**
 * Remove the global Escape handler. No-op when nothing is registered. The
 * pump reference and the `registered` flag are reset even if the native
 * unregister throws (the error still propagates to the caller).
 */
export function unregisterEscHotkey(): void {
  if (registered) {
    try {
      const { hotkey } = requireComputerUseSwift()
      hotkey.unregister()
    } finally {
      releasePump()
      registered = false
      logForDebugging('[cu-esc] unregistered')
    }
  }
}
/**
 * Tell the native side that the next Escape is synthesized by the model.
 * Does nothing while no handler is registered.
 */
export function notifyExpectedEscape(): void {
  if (registered) {
    const { hotkey } = requireComputerUseSwift()
    hotkey.notifyExpectedEscape()
  }
}
+658
View File
@@ -0,0 +1,658 @@
/**
* CLI `ComputerExecutor` implementation. Wraps two native modules:
* - `@ant/computer-use-input` (Rust/enigo) — mouse, keyboard, frontmost app
* - `@ant/computer-use-swift` — SCContentFilter screenshots, NSWorkspace apps, TCC
*
* Contract: `packages/desktop/computer-use-mcp/src/executor.ts` in the apps
* repo. The reference impl is Cowork's `apps/desktop/src/main/nest-only/
* computer-use/executor.ts` — see notable deviations under "CLI deltas" below.
*
* ── CLI deltas from Cowork ─────────────────────────────────────────────────
*
* No `withClickThrough`. Cowork wraps every mouse op in
* `BrowserWindow.setIgnoreMouseEvents(true)` so clicks fall through the
* overlay. We're a terminal — no window — so the click-through bracket is
* a no-op. The sentinel `CLI_HOST_BUNDLE_ID` never matches frontmost.
*
* Terminal as surrogate host. `getTerminalBundleId()` detects the emulator
* we're running inside. It's passed as `hostBundleId` to `prepareDisplay`/
* `resolvePrepareCapture` so the Swift side exempts it from hide AND skips
* it in the activate z-order walk (so the terminal being frontmost doesn't
* eat clicks meant for the target app). Also stripped from `allowedBundleIds`
* via `withoutTerminal()` so screenshots don't capture it (Swift 0.2.1's
* captureExcluding takes an allow-list despite the name — apps#30355).
* `capabilities.hostBundleId` stays as the sentinel — the package's
* frontmost gate uses that, and the terminal being frontmost is fine.
*
* Clipboard via `pbcopy`/`pbpaste`. No Electron `clipboard` module.
*/
import type {
ComputerExecutor,
DisplayGeometry,
FrontmostApp,
InstalledApp,
ResolvePrepareCaptureResult,
RunningApp,
ScreenshotResult,
} from '@ant/computer-use-mcp'
import { API_RESIZE_PARAMS, targetImageSize } from '@ant/computer-use-mcp'
import { logForDebugging } from '../debug.js'
import { errorMessage } from '../errors.js'
import { execFileNoThrow } from '../execFileNoThrow.js'
import { sleep } from '../sleep.js'
import {
CLI_CU_CAPABILITIES,
CLI_HOST_BUNDLE_ID,
getTerminalBundleId,
} from './common.js'
import { drainRunLoop } from './drainRunLoop.js'
import { notifyExpectedEscape } from './escHotkey.js'
import { requireComputerUseInput } from './inputLoader.js'
import { requireComputerUseSwift } from './swiftLoader.js'
// ── Helpers ───────────────────────────────────────────────────────────────────
// JPEG quality setting for screenshots — presumably on a 0–1 scale; confirm
// against the capture call that consumes it.
const SCREENSHOT_JPEG_QUALITY = 0.75
/** Logical → physical → API target dims. See `targetImageSize` + COORDINATES.md. */
function computeTargetDims(
  logicalW: number,
  logicalH: number,
  scaleFactor: number,
): [number, number] {
  // Logical points → physical pixels, rounded to whole pixels.
  const toPhysical = (logical: number): number =>
    Math.round(logical * scaleFactor)
  return targetImageSize(
    toPhysical(logicalW),
    toPhysical(logicalH),
    API_RESIZE_PARAMS,
  )
}
/**
 * Read the clipboard by running `pbpaste`.
 *
 * @returns The command's stdout.
 * @throws Error when `pbpaste` exits with a non-zero code.
 */
async function readClipboardViaPbpaste(): Promise<string> {
  const result = await execFileNoThrow('pbpaste', [], {
    useCwd: false,
  })
  if (result.code === 0) return result.stdout
  throw new Error(`pbpaste exited with code ${result.code}`)
}
/**
 * Replace the clipboard content by piping `text` into `pbcopy`.
 *
 * @throws Error when `pbcopy` exits with a non-zero code.
 */
async function writeClipboardViaPbcopy(text: string): Promise<void> {
  const result = await execFileNoThrow('pbcopy', [], {
    input: text,
    useCwd: false,
  })
  if (result.code === 0) return
  throw new Error(`pbcopy exited with code ${result.code}`)
}
// Type of the native input module, derived from its loader so the two
// cannot drift apart.
type Input = ReturnType<typeof requireComputerUseInput>
/**
* Single-element key sequence matching "escape" or "esc" (case-insensitive).
* Used to hole-punch the CGEventTap abort for model-synthesized Escape — enigo
* accepts both spellings, so the tap must too.
*/
function isBareEscape(parts: readonly string[]): boolean {
  // Only a lone key qualifies; chords such as ['command', 'escape'] do not.
  if (parts.length !== 1) return false
  switch (parts[0]!.toLowerCase()) {
    case 'escape':
    case 'esc':
      return true
    default:
      return false
  }
}
/**
* Instant move, then 50ms — an input→HID→AppKit→NSEvent round-trip before the
* caller reads `NSEvent.mouseLocation` or dispatches a click. Used for click,
* scroll, and drag-from; `animatedMove` is reserved for drag-to only. The
* intermediate animation frames were triggering hover states and, on the
* decomposed mouseDown/moveMouse path, emitting stray `.leftMouseDragged`
* events (toolCalls.ts handleScroll's mouse_full workaround).
*/
// Pause (ms) after a pointer move before the caller acts on the new position.
const MOVE_SETTLE_MS = 50
/** Move the pointer to (x, y) with the third `moveMouse` argument false, then wait MOVE_SETTLE_MS. */
async function moveAndSettle(
  input: Input,
  x: number,
  y: number,
): Promise<void> {
  const animate = false
  await input.moveMouse(x, y, animate)
  await sleep(MOVE_SETTLE_MS)
}
/**
* Release `pressed` in reverse (last pressed = first released). Errors are
* swallowed so a release failure never masks the real error.
*
* Drains via pop() rather than snapshotting length: if a drainRunLoop-
* orphaned press lambda resolves an in-flight input.key() AFTER finally
* calls us, that late push is still released on the next iteration. The
* orphaned flag stops the lambda at its NEXT check, not the current await.
*/
async function releasePressed(input: Input, pressed: string[]): Promise<void> {
  // Re-read the array on every iteration (pop, not a snapshot) so an entry
  // pushed while a release is awaiting is still released afterwards.
  for (;;) {
    const key = pressed.pop()
    if (key === undefined) return
    try {
      await input.key(key, 'release')
    } catch {
      // Swallow — best-effort release.
    }
  }
}
/**
* Bracket `fn()` with modifier press/release. `pressed` tracks which presses
* actually landed, so a mid-press throw only releases what was pressed — no
* stuck modifiers. The finally covers both press-phase and fn() throws.
*
* Caller must already be inside drainRunLoop() — key() dispatches to the
* main queue and needs the pump to resolve.
*/
async function withModifiers<T>(
  input: Input,
  mods: string[],
  fn: () => Promise<T>,
): Promise<T> {
  // Records only the modifiers whose press actually completed.
  const held: string[] = []
  const pressAll = async (): Promise<void> => {
    for (const mod of mods) {
      await input.key(mod, 'press')
      held.push(mod)
    }
  }
  try {
    await pressAll()
    return await fn()
  } finally {
    // Runs for press-phase failures and fn() failures alike.
    await releasePressed(input, held)
  }
}
/**
* Port of Cowork's `typeViaClipboard`. Sequence:
* 1. Save the user's clipboard.
* 2. Write our text.
* 3. READ-BACK VERIFY — clipboard writes can silently fail. If the
* read-back doesn't match, never press Cmd+V (would paste junk).
* 4. Cmd+V via keys().
* 5. Sleep 100ms — battle-tested threshold for the paste-effect vs
* clipboard-restore race. Restoring too soon means the target app
* pastes the RESTORED content.
* 6. Restore — in a `finally`, so a throw between 2-5 never leaves the
* user's clipboard clobbered. Restore failures are swallowed.
*/
async function typeViaClipboard(input: Input, text: string): Promise<void> {
  // Step 1: remember the user's clipboard. If that fails we still paste, but
  // there is nothing to restore afterwards.
  const saved: string | undefined = await readClipboardViaPbpaste().catch(
    () => {
      logForDebugging(
        '[computer-use] pbpaste before paste failed; proceeding without restore',
      )
      return undefined
    },
  )
  try {
    // Steps 2-3: write, then verify the write actually landed.
    await writeClipboardViaPbcopy(text)
    const roundTripped = await readClipboardViaPbpaste()
    if (roundTripped !== text) {
      throw new Error('Clipboard write did not round-trip.')
    }
    // Steps 4-5: paste, then give the target app time to consume it.
    await input.keys(['command', 'v'])
    await sleep(100)
  } finally {
    // Step 6: best-effort restore of the saved clipboard.
    if (typeof saved === 'string') {
      await writeClipboardViaPbcopy(saved).catch(() => {
        logForDebugging('[computer-use] clipboard restore after paste failed')
      })
    }
  }
}
/**
* Port of Cowork's `animateMouseMovement` + `animatedMove`. Ease-out-cubic at
* 60fps; distance-proportional duration at 2000 px/sec, capped at 0.5s. When
* the sub-gate is off (or distance < ~2 frames), falls through to
* `moveAndSettle`. Called only from `drag` for the press→to motion — target
* apps may watch for `.leftMouseDragged` specifically (not just "button down +
* position changed") and the slow motion gives them time to process
* intermediate positions (scrollbars, window resizes).
*/
async function animatedMove(
  input: Input,
  targetX: number,
  targetY: number,
  mouseAnimationEnabled: boolean,
): Promise<void> {
  if (!mouseAnimationEnabled) {
    return moveAndSettle(input, targetX, targetY)
  }
  const origin = await input.mouseLocation()
  const dx = targetX - origin.x
  const dy = targetY - origin.y
  const distance = Math.hypot(dx, dy)
  // Already (almost) there — nothing to move.
  if (distance < 1) return

  // 2000 px/sec, capped at half a second.
  const durationSec = Math.min(distance / 2000, 0.5)
  if (durationSec < 0.03) {
    // Too short to animate (under ~2 frames): jump instead.
    return moveAndSettle(input, targetX, targetY)
  }

  const framesPerSecond = 60
  const frameMs = 1000 / framesPerSecond
  const frameCount = Math.floor(durationSec * framesPerSecond)
  const easeOutCubic = (t: number): number => 1 - Math.pow(1 - t, 3)

  for (let i = 1; i <= frameCount; i++) {
    const progress = easeOutCubic(i / frameCount)
    const frameX = Math.round(origin.x + dx * progress)
    const frameY = Math.round(origin.y + dy * progress)
    await input.moveMouse(frameX, frameY, false)
    // No inter-frame pause after the final frame; the settle sleep follows.
    if (i !== frameCount) {
      await sleep(frameMs)
    }
  }
  // Last frame has no trailing sleep — same HID round-trip before the
  // caller's mouseButton reads NSEvent.mouseLocation.
  await sleep(MOVE_SETTLE_MS)
}
// ── Factory ───────────────────────────────────────────────────────────────
/**
 * Builds the `ComputerExecutor` used by the CLI. macOS-only: throws
 * immediately on any other platform.
 *
 * The Swift native module is loaded here at construction; the input module is
 * fetched inside each mouse/keyboard method via `requireComputerUseInput()`.
 *
 * @param opts.getMouseAnimationEnabled Read on every `drag` to choose between
 * the animated and the single-jump move.
 * @param opts.getHideBeforeActionEnabled Read on every `prepareForAction`;
 * when it returns false that method does nothing and resolves to `[]`.
 * @returns The executor object; its methods close over the loaded Swift
 * module and the detected terminal bundle id.
 * @throws Error when `process.platform` is not `darwin`.
 */
export function createCliExecutor(opts: {
getMouseAnimationEnabled: () => boolean
getHideBeforeActionEnabled: () => boolean
}): ComputerExecutor {
if (process.platform !== 'darwin') {
throw new Error(
`createCliExecutor called on ${process.platform}. Computer control is macOS-only.`,
)
}
// Swift loaded once at factory time — every executor method needs it.
// Input loaded lazily via requireComputerUseInput() on first mouse/keyboard
// call — it caches internally, so screenshot-only flows never pull the
// enigo .node.
const cu = requireComputerUseSwift()
const { getMouseAnimationEnabled, getHideBeforeActionEnabled } = opts
const terminalBundleId = getTerminalBundleId()
// Host id handed to the Swift calls below: the detected terminal when there
// is one, otherwise the CLI sentinel id.
const surrogateHost = terminalBundleId ?? CLI_HOST_BUNDLE_ID
// Swift 0.2.1's captureExcluding/captureRegion take an ALLOW list despite the
// name (apps#30355 — complement computed Swift-side against running apps).
// The terminal isn't in the user's grants so it's naturally excluded, but if
// the package ever passes it through we strip it here so the terminal never
// photobombs a screenshot.
const withoutTerminal = (allowed: readonly string[]): string[] =>
terminalBundleId === null
? [...allowed]
: allowed.filter(id => id !== terminalBundleId)
logForDebugging(
terminalBundleId
? `[computer-use] terminal ${terminalBundleId} → surrogate host (hide-exempt, activate-skip, screenshot-excluded)`
: '[computer-use] terminal not detected; falling back to sentinel host',
)
return {
capabilities: {
...CLI_CU_CAPABILITIES,
// Always the sentinel id here, even when a terminal was detected (unlike
// surrogateHost above).
hostBundleId: CLI_HOST_BUNDLE_ID,
},
// ── Pre-action sequence (hide + defocus) ────────────────────────────
async prepareForAction(
allowlistBundleIds: string[],
displayId?: number,
): Promise<string[]> {
if (!getHideBeforeActionEnabled()) {
return []
}
// prepareDisplay isn't @MainActor (plain Task{}), but its .hide() calls
// trigger window-manager events that queue on CFRunLoop. Without the
// pump, those pile up during Swift's ~1s of usleeps and flush all at
// once when the next pumped call runs — visible window flashing.
// Electron drains CFRunLoop continuously so Cowork doesn't see this.
// Worst-case 100ms + 5×200ms safety-net ≈ 1.1s, well under the 30s
// drainRunLoop ceiling.
//
// "Continue with action execution even if switching fails" — the
// frontmost gate in toolCalls.ts catches any actual unsafe state.
return drainRunLoop(async () => {
try {
const result = await cu.apps.prepareDisplay(
allowlistBundleIds,
surrogateHost,
displayId,
)
if (result.activated) {
logForDebugging(
`[computer-use] prepareForAction: activated ${result.activated}`,
)
}
return result.hidden
} catch (err) {
logForDebugging(
`[computer-use] prepareForAction failed; continuing to action: ${errorMessage(err)}`,
{ level: 'warn' },
)
return []
}
})
},
// The surrogate host is appended to the allowlist sent to Swift — presumably
// so it never appears in the preview as something to hide (NOTE(review):
// confirm against the Swift side).
async previewHideSet(
allowlistBundleIds: string[],
displayId?: number,
): Promise<Array<{ bundleId: string; displayName: string }>> {
return cu.apps.previewHideSet(
[...allowlistBundleIds, surrogateHost],
displayId,
)
},
// ── Display ──────────────────────────────────────────────────────────
async getDisplaySize(displayId?: number): Promise<DisplayGeometry> {
return cu.display.getSize(displayId)
},
async listDisplays(): Promise<DisplayGeometry[]> {
return cu.display.listAll()
},
async findWindowDisplays(
bundleIds: string[],
): Promise<Array<{ bundleId: string; displayIds: number[] }>> {
return cu.apps.findWindowDisplays(bundleIds)
},
// Sizes the capture from the preferred display's geometry, then hands the
// terminal-stripped allowlist and the surrogate host to Swift inside the
// run-loop pump.
async resolvePrepareCapture(opts: {
allowedBundleIds: string[]
preferredDisplayId?: number
autoResolve: boolean
doHide?: boolean
}): Promise<ResolvePrepareCaptureResult> {
// getSize's result is read without await, i.e. it is used as a synchronous
// call here.
const d = cu.display.getSize(opts.preferredDisplayId)
const [targetW, targetH] = computeTargetDims(
d.width,
d.height,
d.scaleFactor,
)
return drainRunLoop(() =>
cu.resolvePrepareCapture(
withoutTerminal(opts.allowedBundleIds),
surrogateHost,
SCREENSHOT_JPEG_QUALITY,
targetW,
targetH,
opts.preferredDisplayId,
opts.autoResolve,
opts.doHide,
),
)
},
/**
* Pre-size to `targetImageSize` output so the API transcoder's early-return
* fires — no server-side resize, `scaleCoord` stays coherent. See
* packages/desktop/computer-use-mcp/COORDINATES.md.
*/
async screenshot(opts: {
allowedBundleIds: string[]
displayId?: number
}): Promise<ScreenshotResult> {
const d = cu.display.getSize(opts.displayId)
const [targetW, targetH] = computeTargetDims(
d.width,
d.height,
d.scaleFactor,
)
return drainRunLoop(() =>
cu.screenshot.captureExcluding(
withoutTerminal(opts.allowedBundleIds),
SCREENSHOT_JPEG_QUALITY,
targetW,
targetH,
opts.displayId,
),
)
},
// Captures a sub-region. Output dimensions come from the region's own
// logical width/height combined with the display's scale factor.
async zoom(
regionLogical: { x: number; y: number; w: number; h: number },
allowedBundleIds: string[],
displayId?: number,
): Promise<{ base64: string; width: number; height: number }> {
const d = cu.display.getSize(displayId)
const [outW, outH] = computeTargetDims(
regionLogical.w,
regionLogical.h,
d.scaleFactor,
)
return drainRunLoop(() =>
cu.screenshot.captureRegion(
withoutTerminal(allowedBundleIds),
regionLogical.x,
regionLogical.y,
regionLogical.w,
regionLogical.h,
outW,
outH,
SCREENSHOT_JPEG_QUALITY,
displayId,
),
)
},
// ── Keyboard ─────────────────────────────────────────────────────────
/**
* xdotool-style sequence e.g. "ctrl+shift+a" → split on '+' and pass to
* keys(). keys() dispatches to DispatchQueue.main — drainRunLoop pumps
* CFRunLoop so it resolves. Rust's error-path cleanup (enigo_wrap.rs)
* releases modifiers on each invocation, so a mid-loop throw leaves
* nothing stuck. 8ms between iterations — 125Hz USB polling cadence.
*/
async key(keySequence: string, repeat?: number): Promise<void> {
const input = requireComputerUseInput()
// Empty segments (e.g. from a doubled or trailing '+') are dropped.
const parts = keySequence.split('+').filter(p => p.length > 0)
// Bare-only: the CGEventTap checks event.flags.isEmpty so ctrl+escape
// etc. pass through without aborting.
const isEsc = isBareEscape(parts)
// Omitted repeat means a single press.
const n = repeat ?? 1
await drainRunLoop(async () => {
for (let i = 0; i < n; i++) {
if (i > 0) {
await sleep(8)
}
if (isEsc) {
notifyExpectedEscape()
}
await input.keys(parts)
}
})
},
async holdKey(keyNames: string[], durationMs: number): Promise<void> {
const input = requireComputerUseInput()
// Press/release each wrapped in drainRunLoop; the sleep sits outside so
// durationMs isn't bounded by drainRunLoop's 30s timeout. `pressed`
// tracks which presses landed so a mid-press throw still releases
// everything that was actually pressed.
//
// `orphaned` guards against a timeout-orphan race: if the press-phase
// drainRunLoop times out while the esc-hotkey pump-retain keeps the
// pump running, the orphaned lambda would continue pushing to `pressed`
// after finally's releasePressed snapshotted the length — leaving keys
// stuck. The flag stops the lambda at the next iteration.
const pressed: string[] = []
let orphaned = false
try {
await drainRunLoop(async () => {
for (const k of keyNames) {
if (orphaned) return
// Bare Escape: notify the CGEventTap so it doesn't fire the
// abort callback for a model-synthesized press. Same as key().
if (isBareEscape([k])) {
notifyExpectedEscape()
}
await input.key(k, 'press')
pressed.push(k)
}
})
await sleep(durationMs)
} finally {
orphaned = true
await drainRunLoop(() => releasePressed(input, pressed))
}
},
async type(text: string, opts: { viaClipboard: boolean }): Promise<void> {
const input = requireComputerUseInput()
if (opts.viaClipboard) {
// keys(['command','v']) inside needs the pump.
await drainRunLoop(() => typeViaClipboard(input, text))
return
}
// `toolCalls.ts` handles the grapheme loop + 8ms sleeps and calls this
// once per grapheme. typeText doesn't dispatch to the main queue.
await input.typeText(text)
},
readClipboard: readClipboardViaPbpaste,
writeClipboard: writeClipboardViaPbcopy,
// ── Mouse ────────────────────────────────────────────────────────────
async moveMouse(x: number, y: number): Promise<void> {
await moveAndSettle(requireComputerUseInput(), x, y)
},
/**
* Move, then click. Modifiers are press/release bracketed via withModifiers
* — same pattern as Cowork. AppKit computes NSEvent.clickCount from timing
* + position proximity, so double/triple click work without setting the
* CGEvent clickState field. key() inside withModifiers needs the pump;
* the modifier-less path doesn't.
*/
async click(
x: number,
y: number,
button: 'left' | 'right' | 'middle',
count: 1 | 2 | 3,
modifiers?: string[],
): Promise<void> {
const input = requireComputerUseInput()
await moveAndSettle(input, x, y)
if (modifiers && modifiers.length > 0) {
await drainRunLoop(() =>
withModifiers(input, modifiers, () =>
input.mouseButton(button, 'click', count),
),
)
} else {
await input.mouseButton(button, 'click', count)
}
},
// mouseDown/mouseUp act on the left button only, at the current cursor
// position; neither takes a button or coordinate argument.
async mouseDown(): Promise<void> {
await requireComputerUseInput().mouseButton('left', 'press')
},
async mouseUp(): Promise<void> {
await requireComputerUseInput().mouseButton('left', 'release')
},
async getCursorPosition(): Promise<{ x: number; y: number }> {
return requireComputerUseInput().mouseLocation()
},
/**
* `from === undefined` → drag from current cursor (training's
* left_click_drag with start_coordinate omitted). Inner `finally`: the
* button is ALWAYS released even if the move throws — otherwise the
* user's left button is stuck-pressed until they physically click.
* 50ms sleep after press: enigo's move_mouse reads NSEvent.pressedMouseButtons
* to decide .leftMouseDragged vs .mouseMoved; the synthetic leftMouseDown
* needs a HID-tap round-trip to show up there.
*/
async drag(
from: { x: number; y: number } | undefined,
to: { x: number; y: number },
): Promise<void> {
const input = requireComputerUseInput()
if (from !== undefined) {
await moveAndSettle(input, from.x, from.y)
}
await input.mouseButton('left', 'press')
await sleep(MOVE_SETTLE_MS)
try {
// The animation sub-gate is read at drag time, not at factory time.
await animatedMove(input, to.x, to.y, getMouseAnimationEnabled())
} finally {
await input.mouseButton('left', 'release')
}
},
/**
* Move first, then scroll each axis. Vertical-first — it's the common
* axis; a horizontal failure shouldn't lose the vertical.
*/
async scroll(x: number, y: number, dx: number, dy: number): Promise<void> {
const input = requireComputerUseInput()
await moveAndSettle(input, x, y)
// Zero deltas are skipped entirely rather than sent as no-op scrolls.
if (dy !== 0) {
await input.mouseScroll(dy, 'vertical')
}
if (dx !== 0) {
await input.mouseScroll(dx, 'horizontal')
}
},
// ── App management ───────────────────────────────────────────────────
async getFrontmostApp(): Promise<FrontmostApp | null> {
const info = requireComputerUseInput().getFrontmostAppInfo()
// No info, or info without a bundle id, is reported as "no frontmost app".
if (!info || !info.bundleId) return null
return { bundleId: info.bundleId, displayName: info.appName }
},
async appUnderPoint(
x: number,
y: number,
): Promise<{ bundleId: string; displayName: string } | null> {
return cu.apps.appUnderPoint(x, y)
},
async listInstalledApps(): Promise<InstalledApp[]> {
// `ComputerUseInstalledApp` is `{bundleId, displayName, path}`.
// `InstalledApp` adds optional `iconDataUrl` — left unpopulated;
// the approval dialog fetches lazily via getAppIcon() below.
return drainRunLoop(() => cu.apps.listInstalled())
},
async getAppIcon(path: string): Promise<string | undefined> {
// Normalise a null/absent icon to undefined.
return cu.apps.iconDataUrl(path) ?? undefined
},
async listRunningApps(): Promise<RunningApp[]> {
return cu.apps.listRunning()
},
async openApp(bundleId: string): Promise<void> {
await cu.apps.open(bundleId)
},
}
}
/**
 * Asks the Swift module to unhide the given bundle ids. Lives at module level
 * (not on the executor object) because `stopHooks.ts` / `query.ts` call it at
 * turn end, outside the executor lifecycle. Call sites fire-and-forget and
 * attach their own `.catch()`.
 *
 * @param bundleIds Bundle ids to unhide; an empty list returns immediately
 * without loading the Swift module.
 */
export async function unhideComputerUseApps(
  bundleIds: readonly string[],
): Promise<void> {
  const hasWork = bundleIds.length > 0
  if (hasWork) {
    const swift = requireComputerUseSwift()
    // The native call gets its own mutable copy of the readonly input.
    const mutableIds = Array.from(bundleIds)
    await swift.apps.unhide(mutableIds)
  }
}
+72
View File
@@ -0,0 +1,72 @@
import type { CoordinateMode, CuSubGates } from '@ant/computer-use-mcp/types'
import { getDynamicConfig_CACHED_MAY_BE_STALE } from '../../services/analytics/growthbook.js'
import { getSubscriptionType } from '../auth.js'
import { isEnvTruthy } from '../envUtils.js'
/**
 * Shape of the dynamic config for computer use: every sub-gate from
 * `CuSubGates`, plus the master `enabled` switch and the coordinate mode.
 */
type ChicagoConfig = CuSubGates & {
enabled: boolean
coordinateMode: CoordinateMode
}
/**
 * Fallback values. `readConfig` passes this object as the default for the
 * dynamic-config lookup and also spreads the lookup result over it, so any
 * field missing from the remote payload takes the value listed here. The
 * feature itself (`enabled`) and `pixelValidation` default to off; the other
 * sub-gates default to on.
 */
const DEFAULTS: ChicagoConfig = {
enabled: false,
pixelValidation: false,
clipboardPasteMultiline: true,
mouseAnimation: true,
hideBeforeAction: true,
autoTargetDisplay: true,
clipboardGuard: true,
coordinateMode: 'pixels',
}
// The type argument on the dynamic-config getter is only an assertion —
// nothing validates the payload — so a partial JSON such as {"enabled": true}
// would otherwise leave the remaining fields undefined. Layering the payload
// on top of DEFAULTS fills every gap.
function readConfig(): ChicagoConfig {
  const remote = getDynamicConfig_CACHED_MAY_BE_STALE<Partial<ChicagoConfig>>(
    'tengu_malort_pedway',
    DEFAULTS,
  )
  return { ...DEFAULTS, ...remote }
}
// External rollout is limited to Max and Pro subscribers. Ants bypass the
// tier check so dogfooding continues whatever their subscription — not all
// ants are max/pro, and per CLAUDE.md:281, USER_TYPE !== 'ant' branches get
// zero antfooding.
function hasRequiredSubscription(): boolean {
  const isAnt = process.env.USER_TYPE === 'ant'
  if (isAnt) {
    return true
  }
  switch (getSubscriptionType()) {
    case 'max':
    case 'pro':
      return true
    default:
      return false
  }
}
export function getChicagoEnabled(): boolean {
  const env = process.env
  // Ants whose shell inherited monorepo dev config are switched off.
  // MONOREPO_ROOT_DIR is exported by config/local/zsh/zshrc, which
  // laptop-setup.sh wires into ~/.zshrc, so its presence is a cheap proxy for
  // "has monorepo access". ALLOW_ANT_COMPUTER_USE_MCP=1 overrides the block.
  const isMonorepoAnt =
    env.USER_TYPE === 'ant' && Boolean(env.MONOREPO_ROOT_DIR)
  if (isMonorepoAnt && !isEnvTruthy(env.ALLOW_ANT_COMPUTER_USE_MCP)) {
    return false
  }
  // Subscription gate first; the remote config is only consulted when it
  // passes.
  if (!hasRequiredSubscription()) {
    return false
  }
  return readConfig().enabled
}
/**
 * Current sub-gate flags: the full config minus the two fields (`enabled`,
 * `coordinateMode`) that are not sub-gates. Reads the config on every call.
 */
export function getChicagoSubGates(): CuSubGates {
  const config = readConfig()
  const {
    enabled: _masterSwitch,
    coordinateMode: _mode,
    ...gatesOnly
  } = config
  return gatesOnly
}
// Captured on the first read and never refreshed. setup.ts builds tool
// descriptions and executor.ts scales coordinates off this same value; a live
// read would let a mid-session GB flip tell the model "pixels" while clicks
// are transformed as normalized.
let frozenCoordinateMode: CoordinateMode | undefined
export function getChicagoCoordinateMode(): CoordinateMode {
  if (frozenCoordinateMode == null) {
    frozenCoordinateMode = readConfig().coordinateMode
  }
  return frozenCoordinateMode
}
+69
View File
@@ -0,0 +1,69 @@
import type {
ComputerUseHostAdapter,
Logger,
} from '@ant/computer-use-mcp/types'
import { format } from 'util'
import { logForDebugging } from '../debug.js'
import { COMPUTER_USE_MCP_SERVER_NAME } from './common.js'
import { createCliExecutor } from './executor.js'
import { getChicagoEnabled, getChicagoSubGates } from './gates.js'
import { requireComputerUseSwift } from './swiftLoader.js'
/**
 * `Logger` implementation that routes every message to `logForDebugging`.
 * Messages are expanded printf-style with `util.format` before forwarding.
 * `silly` is logged at `debug` level, the same as `debug`.
 */
class DebugLogger implements Logger {
  silly(message: string, ...args: unknown[]): void {
    this.forward('debug', message, args)
  }

  debug(message: string, ...args: unknown[]): void {
    this.forward('debug', message, args)
  }

  info(message: string, ...args: unknown[]): void {
    this.forward('info', message, args)
  }

  warn(message: string, ...args: unknown[]): void {
    this.forward('warn', message, args)
  }

  error(message: string, ...args: unknown[]): void {
    this.forward('error', message, args)
  }

  /** Formats once and hands the result to the debug log at `level`. */
  private forward(
    level: 'debug' | 'info' | 'warn' | 'error',
    message: string,
    args: unknown[],
  ): void {
    logForDebugging(format(message, ...args), { level })
  }
}
let cached: ComputerUseHostAdapter | undefined

/**
 * Process-lifetime singleton: the adapter is built on the first call and the
 * same object is returned afterwards. Construction runs `createCliExecutor`,
 * which throws on failure (non-macOS, or the Swift module failing to load) —
 * there is no degraded mode, and a failed build leaves nothing cached.
 *
 * Only `@ant/computer-use-swift` is loaded by the executor factory;
 * `@ant/computer-use-input` is loaded lazily by the executor's first
 * mouse/keyboard call.
 */
export function getComputerUseHostAdapter(): ComputerUseHostAdapter {
  if (cached !== undefined) {
    return cached
  }

  const logger = new DebugLogger()
  // Sub-gates are passed as getters so each action sees the current value.
  const executor = createCliExecutor({
    getMouseAnimationEnabled: () => getChicagoSubGates().mouseAnimation,
    getHideBeforeActionEnabled: () => getChicagoSubGates().hideBeforeAction,
  })

  const adapter: ComputerUseHostAdapter = {
    serverName: COMPUTER_USE_MCP_SERVER_NAME,
    logger,
    executor,
    ensureOsPermissions: async () => {
      const { tcc } = requireComputerUseSwift()
      const accessibility = tcc.checkAccessibility()
      const screenRecording = tcc.checkScreenRecording()
      if (accessibility && screenRecording) {
        return { granted: true }
      }
      // Report each permission separately so the caller can tell which one
      // is missing.
      return { granted: false, accessibility, screenRecording }
    },
    isDisabled: () => !getChicagoEnabled(),
    getSubGates: getChicagoSubGates,
    // cleanup.ts always unhides at turn end — no user preference to disable it.
    getAutoUnhideEnabled: () => true,
    // Pixel-validation JPEG decode+crop. MUST be synchronous (the package
    // does `patch1.equals(patch2)` directly on the return value). Cowork uses
    // Electron's `nativeImage` (sync); our `image-processor-napi` is
    // sharp-compatible and async-only. Returning null → validation skipped,
    // click proceeds — the designed fallback per `PixelCompareResult.skipped`.
    // The sub-gate defaults to false anyway.
    cropRawPatch: () => null,
  }

  cached = adapter
  return adapter
}
+30
View File
@@ -0,0 +1,30 @@
import type {
ComputerUseInput,
ComputerUseInputAPI,
} from '@ant/computer-use-input'
let cached: ComputerUseInputAPI | undefined

/**
 * Loads `@ant/computer-use-input` on first use and caches the result.
 *
 * The package's js/index.js reads COMPUTER_USE_INPUT_NODE_PATH (baked by
 * build-with-plugins.ts on darwin targets, unset otherwise — in which case it
 * falls through to the node_modules prebuilds/ path).
 *
 * The package exports a discriminated union on `isSupported`. It is narrowed
 * here once, so callers receive the bare `ComputerUseInputAPI` and need not
 * re-check. An unsupported platform throws and caches nothing.
 *
 * key()/keys() dispatch enigo work onto DispatchQueue.main via
 * dispatch2::run_on_main, then block a tokio worker on a channel. Under
 * Electron (CFRunLoop drains the main queue) this works; under libuv
 * (Node/bun) the main queue never drains and the promise hangs. The executor
 * therefore calls these inside drainRunLoop().
 *
 * @throws Error when the loaded package reports `isSupported === false`.
 */
export function requireComputerUseInput(): ComputerUseInputAPI {
  if (cached !== undefined) {
    return cached
  }
  // eslint-disable-next-line @typescript-eslint/no-require-imports
  const loaded = require('@ant/computer-use-input') as ComputerUseInput
  if (loaded.isSupported) {
    cached = loaded
    return loaded
  }
  throw new Error('@ant/computer-use-input is not supported on this platform')
}
+106
View File
@@ -0,0 +1,106 @@
import {
buildComputerUseTools,
createComputerUseMcpServer,
} from '@ant/computer-use-mcp'
import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js'
import { ListToolsRequestSchema } from '@modelcontextprotocol/sdk/types.js'
import { homedir } from 'os'
import { shutdownDatadog } from '../../services/analytics/datadog.js'
import { shutdown1PEventLogging } from '../../services/analytics/firstPartyEventLogger.js'
import { initializeAnalyticsSink } from '../../services/analytics/sink.js'
import { enableConfigs } from '../config.js'
import { logForDebugging } from '../debug.js'
import { filterAppsForDescription } from './appNames.js'
import { getChicagoCoordinateMode } from './gates.js'
import { getComputerUseHostAdapter } from './hostAdapter.js'
const APP_ENUM_TIMEOUT_MS = 1000

/**
 * Best-effort, time-boxed enumeration of installed apps. If Spotlight is slow
 * or claude-swift throws, the result is `undefined` and the tool description
 * simply omits the list — resolution still happens at call time, the model
 * just gets no hints.
 *
 * @returns Filtered app names for the tool description, or `undefined` when
 * enumeration failed, timed out, or produced a falsy result.
 */
async function tryGetInstalledAppNames(): Promise<string[] | undefined> {
  const { executor } = getComputerUseHostAdapter()
  const enumeration = executor.listInstalledApps()

  let timeoutHandle: ReturnType<typeof setTimeout> | undefined
  const deadline = new Promise<undefined>(resolve => {
    timeoutHandle = setTimeout(resolve, APP_ENUM_TIMEOUT_MS, undefined)
  })

  let apps: Awaited<typeof enumeration> | undefined
  try {
    // Whichever settles first wins; the deadline resolves to undefined.
    apps = await Promise.race([enumeration, deadline])
  } catch {
    apps = undefined
  } finally {
    clearTimeout(timeoutHandle)
  }

  if (apps) {
    return filterAppsForDescription(apps, homedir())
  }

  // The enumeration may still be running in the background — swallow a late
  // rejection so it never surfaces as unhandled.
  void enumeration.catch(() => {})
  logForDebugging(
    `[Computer Use MCP] app enumeration exceeded ${APP_ENUM_TIMEOUT_MS}ms or failed; tool description omits list`,
  )
  return undefined
}
/**
 * Builds the in-process MCP server. The package's
 * `createComputerUseMcpServer` supplies the Server object and a stub CallTool
 * handler; this function then REPLACES the ListTools handler with one whose
 * `request_access` description includes the installed-app names. The
 * package's factory doesn't accept `installedAppNames`, and Cowork builds its
 * own tool array in serverDef.ts for the same reason.
 *
 * Async so the 1s app-enumeration timeout doesn't block startup — called from
 * an `await import()` in `client.ts` on first CU connection, not `main.tsx`.
 *
 * Real dispatch still goes through `wrapper.tsx`'s `.call()` override; this
 * server exists only to answer ListTools.
 *
 * @returns The server with the custom ListTools handler installed. The tool
 * list is computed once here; the handler returns an empty list whenever the
 * adapter reports the feature disabled at request time.
 */
export async function createComputerUseMcpServerForCli(): Promise<
  ReturnType<typeof createComputerUseMcpServer>
> {
  const hostAdapter = getComputerUseHostAdapter()
  const mode = getChicagoCoordinateMode()
  const mcpServer = createComputerUseMcpServer(hostAdapter, mode)

  const appNames = await tryGetInstalledAppNames()
  const toolList = buildComputerUseTools(
    hostAdapter.executor.capabilities,
    mode,
    appNames,
  )

  mcpServer.setRequestHandler(ListToolsRequestSchema, async () => {
    // Checked per request so a gate flip takes effect without a rebuild.
    if (hostAdapter.isDisabled()) {
      return { tools: [] }
    }
    return { tools: toolList }
  })
  return mcpServer
}
/**
 * Subprocess entrypoint for `--computer-use-mcp`. Mirror of
 * `runClaudeInChromeMcpServer` — stdio transport, exit on stdin close,
 * flush analytics before exit.
 *
 * The analytics flush is best-effort: a failing flush is logged and the
 * process still exits with code 0. Without that guard a rejected flush would
 * escape the `void`-ed shutdown promise as an unhandled rejection and skip
 * the clean exit.
 *
 * @returns Resolves once the server is connected to the stdio transport; the
 * process then stays alive until stdin ends or errors.
 */
export async function runComputerUseMcpServer(): Promise<void> {
  enableConfigs()
  initializeAnalyticsSink()
  const server = await createComputerUseMcpServerForCli()
  const transport = new StdioServerTransport()

  // Guards against running shutdown twice when both 'end' and 'error' fire.
  let exiting = false
  const shutdownAndExit = async (): Promise<void> => {
    if (exiting) return
    exiting = true
    try {
      await Promise.all([shutdown1PEventLogging(), shutdownDatadog()])
    } catch (err) {
      const reason = err instanceof Error ? err.message : String(err)
      logForDebugging(
        `[Computer Use MCP] analytics flush failed during shutdown: ${reason}`,
        { level: 'warn' },
      )
    } finally {
      // Always exit, even when the flush failed — the parent closed stdin.
      // eslint-disable-next-line custom-rules/no-process-exit
      process.exit(0)
    }
  }
  process.stdin.on('end', () => void shutdownAndExit())
  process.stdin.on('error', () => void shutdownAndExit())

  logForDebugging('[Computer Use MCP] Starting MCP server')
  await server.connect(transport)
  logForDebugging('[Computer Use MCP] MCP server started')
}
+53
View File
@@ -0,0 +1,53 @@
import { buildComputerUseTools } from '@ant/computer-use-mcp'
import { join } from 'path'
import { fileURLToPath } from 'url'
import { buildMcpToolName } from '../../services/mcp/mcpStringUtils.js'
import type { ScopedMcpServerConfig } from '../../services/mcp/types.js'
import { isInBundledMode } from '../bundledMode.js'
import { CLI_CU_CAPABILITIES, COMPUTER_USE_MCP_SERVER_NAME } from './common.js'
import { getChicagoCoordinateMode } from './gates.js'
/**
 * Produces the dynamic MCP server config together with the list of allowed
 * tool names. Mirror of `setupClaudeInChrome`. The `mcp__computer-use__*`
 * tools go into `allowedTools` so they bypass the normal permission prompt —
 * the package's `request_access` handles approval for the whole session.
 *
 * The MCP layer isn't ceremony: the API backend detects `mcp__computer-use__*`
 * tool names and emits a CU availability hint into the system prompt
 * (COMPUTER_USE_MCP_AVAILABILITY_HINT in the anthropic repo). Built-in tools
 * with different names wouldn't trigger it. Cowork uses the same names for the
 * same reason (apps/desktop/src/main/local-agent-mode/systemPrompt.ts:314).
 *
 * @returns `mcpConfig` keyed by the computer-use server name, and
 * `allowedTools` holding one MCP-qualified name per computer-use tool.
 */
export function setupComputerUseMCP(): {
  mcpConfig: Record<string, ScopedMcpServerConfig>
  allowedTools: string[]
} {
  const tools = buildComputerUseTools(
    CLI_CU_CAPABILITIES,
    getChicagoCoordinateMode(),
  )
  const allowedTools: string[] = []
  for (const tool of tools) {
    allowedTools.push(buildMcpToolName(COMPUTER_USE_MCP_SERVER_NAME, tool.name))
  }

  // command/args are never spawned — client.ts intercepts by name and
  // uses the in-process server. The config just needs to exist with
  // type 'stdio' to hit the right branch. Mirrors Chrome's setup.
  let args: string[]
  if (isInBundledMode()) {
    args = ['--computer-use-mcp']
  } else {
    const cliEntry = join(fileURLToPath(import.meta.url), '..', 'cli.js')
    args = [cliEntry, '--computer-use-mcp']
  }

  const mcpConfig: Record<string, ScopedMcpServerConfig> = {
    [COMPUTER_USE_MCP_SERVER_NAME]: {
      type: 'stdio',
      command: process.execPath,
      args,
      scope: 'dynamic',
    } as const,
  }
  return { mcpConfig, allowedTools }
}
+23
View File
@@ -0,0 +1,23 @@
import type { ComputerUseAPI } from '@ant/computer-use-swift'
let cached: ComputerUseAPI | undefined

/**
 * Loads `@ant/computer-use-swift` on first use and returns the cached module
 * afterwards. The platform check runs on every call, before the cache is
 * consulted.
 *
 * The package's js/index.js reads COMPUTER_USE_SWIFT_NODE_PATH (baked by
 * build-with-plugins.ts on darwin targets, unset otherwise — in which case it
 * falls through to the node_modules prebuilds/ path).
 *
 * The four @MainActor methods (captureExcluding, captureRegion,
 * apps.listInstalled, resolvePrepareCapture) dispatch to DispatchQueue.main
 * and will hang under libuv unless CFRunLoop is pumped — call sites wrap
 * these in drainRunLoop().
 *
 * @throws Error when `process.platform` is not `darwin`.
 */
export function requireComputerUseSwift(): ComputerUseAPI {
  const onMac = process.platform === 'darwin'
  if (!onMac) {
    throw new Error('@ant/computer-use-swift is macOS-only')
  }
  if (cached === undefined) {
    // eslint-disable-next-line @typescript-eslint/no-require-imports
    cached = require('@ant/computer-use-swift') as ComputerUseAPI
  }
  return cached
}
export type { ComputerUseAPI }
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long