Commit ccc9192e authored by edy

Fix bundled runtime gateway startup recovery

parent 3d1dc1d7
......@@ -21,7 +21,7 @@ export interface ChatGatewayRecoveryCoordinator {
) => Promise<RuntimeStatus>;
shouldRefreshGatewayClient: (config?: AppConfig, inputToken?: string) => Promise<boolean>;
reconfigureGatewayClient: (config?: AppConfig, inputToken?: string) => Promise<void>;
connectGatewayClientWithRetry: () => Promise<void>;
connectGatewayClientWithRetry: (mode?: "external" | "bundled") => Promise<void>;
}
function isManagedRuntimeMode(config: AppConfig): boolean {
......@@ -74,7 +74,7 @@ export async function ensureGatewayReadyForChat(
let gatewayStatus = await coordinator.gatewayStatus();
if (!isGatewayConnected(gatewayStatus) || shouldRefreshGatewayClient) {
await coordinator.connectGatewayClientWithRetry();
await coordinator.connectGatewayClientWithRetry(managedRuntimeMode ? "bundled" : "external");
gatewayStatus = await coordinator.gatewayStatus();
}
......@@ -95,7 +95,7 @@ export async function recoverGatewayForChat(
): Promise<void> {
const config = await coordinator.getConfig();
await coordinator.reconfigureGatewayClient(config, inputToken);
await coordinator.connectGatewayClientWithRetry();
await coordinator.connectGatewayClientWithRetry(isManagedRuntimeMode(config) ? "bundled" : "external");
const runtimeStatus = await coordinator.runtimeStatus();
if (isManagedRuntimeMode(config) && runtimeStatus.processState !== "running") {
......
......@@ -480,8 +480,11 @@ function buildProjectSyncSummary(message: string): Pick<WorkspaceSummary, "chatR
const MANAGED_RUNTIME_START_RETRY_LIMIT = 2;
const MANAGED_RUNTIME_START_RETRY_DELAY_MS = 1500;
const GATEWAY_CONNECT_RETRY_LIMIT = 10;
const GATEWAY_CONNECT_RETRY_LIMIT = 2;
const GATEWAY_CONNECT_RETRY_DELAY_MS = 1000;
const BUNDLED_GATEWAY_CONNECT_RETRY_LIMIT = 4;
const BUNDLED_GATEWAY_CONNECT_TIMEOUT_MS = 4000;
const EXTERNAL_GATEWAY_CONNECT_TIMEOUT_MS = 5000;
const BOOTSTRAP_RECOVERY_RETRY_LIMIT = 2;
const BOOTSTRAP_RECOVERY_RETRY_DELAY_MS = 2000;
......@@ -920,24 +923,28 @@ export function registerDesktopIpc(services: MainServices): RegisteredDesktopIpc
);
};
const connectGatewayClient = async (): Promise<void> => {
type GatewayConnectRetryMode = "external" | "bundled";
const connectGatewayClient = async (timeoutMs = EXTERNAL_GATEWAY_CONNECT_TIMEOUT_MS): Promise<void> => {
const status = await gatewayClient.status().catch(() => null);
if (status?.state === "connected") {
return;
}
await gatewayClient.reconnect().catch(() => gatewayClient.connect());
await gatewayClient.reconnect(timeoutMs).catch(() => gatewayClient.connect(timeoutMs));
};
const connectGatewayClientWithRetry = async (): Promise<void> => {
const connectGatewayClientWithRetry = async (mode: GatewayConnectRetryMode = "external"): Promise<void> => {
const retryLimit = mode === "bundled" ? BUNDLED_GATEWAY_CONNECT_RETRY_LIMIT : GATEWAY_CONNECT_RETRY_LIMIT;
const timeoutMs = mode === "bundled" ? BUNDLED_GATEWAY_CONNECT_TIMEOUT_MS : EXTERNAL_GATEWAY_CONNECT_TIMEOUT_MS;
let lastError: unknown;
for (let attempt = 1; attempt <= GATEWAY_CONNECT_RETRY_LIMIT; attempt += 1) {
for (let attempt = 1; attempt <= retryLimit; attempt += 1) {
try {
await connectGatewayClient();
await connectGatewayClient(timeoutMs);
return;
} catch (error) {
lastError = error;
if (attempt >= GATEWAY_CONNECT_RETRY_LIMIT) {
if (attempt >= retryLimit) {
break;
}
await delay(GATEWAY_CONNECT_RETRY_DELAY_MS);
......@@ -1046,7 +1053,7 @@ export function registerDesktopIpc(services: MainServices): RegisteredDesktopIpc
await runtimeCloudSupervisor.stop(reason);
if (await shouldRefreshGatewayClient(nextConfig, options.inputToken)) {
await reconfigureGatewayClient(nextConfig, options.inputToken);
await connectGatewayClientWithRetry().catch(() => undefined);
await connectGatewayClientWithRetry("external").catch(() => undefined);
}
return runtimeManager.status();
}
......@@ -1067,9 +1074,9 @@ export function registerDesktopIpc(services: MainServices): RegisteredDesktopIpc
status = await runtimeManager.start();
}
if (status.processState !== "error" && await shouldRefreshGatewayClient(nextConfig, options.inputToken)) {
if (status.processState === "running" && await shouldRefreshGatewayClient(nextConfig, options.inputToken)) {
await reconfigureGatewayClient(nextConfig, options.inputToken);
await connectGatewayClientWithRetry().catch(() => undefined);
await connectGatewayClientWithRetry("bundled").catch(() => undefined);
}
if (canUseRuntimeCloudConfig) {
await syncRuntimeCloudSupervisor(reason);
......
......@@ -218,7 +218,7 @@ export class GatewayClient {
this.appendLog("info", `Gateway client reconfigured for ${this.url}.`);
}
async connect(): Promise<GatewayStatus> {
async connect(timeoutMs = 7000): Promise<GatewayStatus> {
if (this.websocket && this.websocket.readyState === WebSocket.OPEN && this.statusSnapshot.state === "connected") {
return this.status();
}
......@@ -230,18 +230,22 @@ export class GatewayClient {
return new Promise<GatewayStatus>((resolve, reject) => {
let settled = false;
let suppressNextClose = false;
let ws: WebSocket | undefined;
const connectTimeout = setTimeout(() => {
const error = new Error(`Timed out while connecting to ${this.url}.`);
this.failConnection(error.message);
suppressNextClose = true;
this.resetTransport(ws);
cleanup();
reject(error);
}, 7000);
}, timeoutMs);
const cleanup = () => {
clearTimeout(connectTimeout);
};
const ws = new WebSocket(this.url);
ws = new WebSocket(this.url);
this.websocket = ws;
ws.on("open", () => {
......@@ -258,7 +262,10 @@ export class GatewayClient {
}
if (frame.type === "res") {
const connectResponse = this.handleConnectResponse(frame, cleanup, resolve, reject);
const connectResponse = this.handleConnectResponse(frame, cleanup, resolve, reject, () => {
suppressNextClose = true;
this.resetTransport(ws);
});
if (connectResponse) {
settled = true;
return;
......@@ -271,6 +278,8 @@ export class GatewayClient {
this.appendLog("warn", `Failed to handle Gateway frame: ${message}`);
if (!settled && this.handshakeSent) {
this.failConnection(message);
suppressNextClose = true;
this.resetTransport(ws);
cleanup();
reject(new Error(message));
}
......@@ -278,17 +287,23 @@ export class GatewayClient {
});
ws.on("close", (code: number) => {
if (suppressNextClose) {
suppressNextClose = false;
this.resetTransport(ws);
return;
}
if (this.intentionalClose) {
this.statusSnapshot = this.createStatus("disconnected", "Gateway disconnected.");
this.appendLog("info", `Gateway connection closed intentionally (${code}).`);
this.websocket = undefined;
this.resetTransport(ws);
return;
}
const message = settled
? `Gateway connection closed (${code}).`
: `Gateway closed during connect (${code}).`;
this.websocket = undefined;
this.resetTransport(ws);
this.failConnection(message);
if (!settled) {
cleanup();
......@@ -300,6 +315,7 @@ export class GatewayClient {
const message = `Failed to connect to ${this.url}. Check that OpenClaw Gateway is reachable.`;
this.failConnection(message);
if (!settled) {
this.resetTransport(ws);
cleanup();
reject(new Error(message));
}
......@@ -325,15 +341,16 @@ export class GatewayClient {
this.websocket.close();
this.websocket = undefined;
}
this.handshakeSent = false;
this.statusSnapshot = this.createStatus("disconnected", "Gateway disconnected.");
this.appendLog("info", "Gateway connection closed by desktop app.");
return this.status();
}
async reconnect(): Promise<GatewayStatus> {
async reconnect(timeoutMs = 7000): Promise<GatewayStatus> {
await this.disconnect();
return this.connect();
return this.connect(timeoutMs);
}
async status(): Promise<GatewayStatus> {
......@@ -551,7 +568,8 @@ export class GatewayClient {
frame: Record<string, unknown>,
cleanup: () => void,
resolve: (value: GatewayStatus) => void,
reject: (reason?: unknown) => void
reject: (reason?: unknown) => void,
closeTransport: () => void
): boolean {
if (frame.id !== "1") {
return false;
......@@ -560,6 +578,7 @@ export class GatewayClient {
if (frame.ok === false) {
const message = this.formatGatewayError(frame.error as GatewayErrorShape | undefined);
this.failConnection(message);
closeTransport();
cleanup();
reject(new Error(message));
return true;
......@@ -1028,6 +1047,20 @@ export class GatewayClient {
this.appendLog("error", message);
}
/**
 * Drops transport state for a socket and closes it if it is still open or connecting.
 *
 * `this.websocket` is cleared only when no socket is passed or when the passed socket is the
 * one currently stored, so a reference to a different socket is left untouched.
 * `this.handshakeSent` is always reset to `false`.
 *
 * @param ws - The socket to tear down; omit to clear only the stored reference and handshake flag.
 */
private resetTransport(ws?: WebSocket): void {
  const shouldForgetStoredSocket = !ws || this.websocket === ws;
  if (shouldForgetStoredSocket) {
    this.websocket = undefined;
  }
  this.handshakeSent = false;

  if (!ws) {
    return;
  }

  try {
    const isOpen = ws.readyState === WebSocket.OPEN;
    if (isOpen || ws.readyState === WebSocket.CONNECTING) {
      ws.close();
    }
  } catch {
    // Ignore transport cleanup errors while reporting the original connection failure.
  }
}
private flattenContent(content?: Array<{ type?: string; text?: string }>): string {
if (!Array.isArray(content)) {
return "";
......@@ -1155,10 +1188,3 @@ export class GatewayClient {
}
}
......@@ -230,11 +230,36 @@ function formatGatewayProbeError(error: GatewayProbeErrorShape | undefined): str
return parts.join(" | ") || "Gateway rejected the readiness probe.";
}
/**
 * Rewrites an error message as a `macSecurityBlock` diagnostic when it matches a known marker.
 *
 * Matching is case-insensitive substring matching against the lowercased message.
 *
 * @param message - Raw error or stderr text; may be absent.
 * @returns
 *   - `undefined` when the message is empty/absent or contains none of the markers;
 *   - the message passed through `trimTrailingPunctuation` with a "." appended when it already
 *     contains "macsecurityblock";
 *   - otherwise a `macSecurityBlock: ...` sentence embedding the trimmed message.
 */
function classifyMacSecurityBlockMessage(message?: string): string | undefined {
  if (!message) {
    return undefined;
  }

  const lowered = message.toLowerCase();

  // The message already carries the macSecurityBlock tag: do not wrap it a second time.
  if (lowered.includes("macsecurityblock")) {
    return trimTrailingPunctuation(message) + ".";
  }

  // Phrases that cause the message to be classified as a macOS security block.
  const blockMarkers = [
    "operation not permitted",
    "code signature",
    "codesign",
    "quarantine",
    "cannot be opened",
    "is damaged",
    "malware",
    "not opened because apple cannot check it"
  ];

  const matchesMarker = blockMarkers.some((marker) => lowered.includes(marker));
  return matchesMarker
    ? `macSecurityBlock: macOS blocked bundled runtime execution (${trimTrailingPunctuation(message)}).`
    : undefined;
}
async function probeGatewayReadiness(
url: string,
token?: string,
timeoutMs = GATEWAY_PROBE_TIMEOUT_MS,
options: { requireStatusHealth?: boolean } = {}
options: { requireStatusHealth?: boolean; probeLabel?: "bundled" | "reusable" } = {}
): Promise<GatewayProbeResult> {
return new Promise<GatewayProbeResult>((resolve) => {
let settled = false;
......@@ -269,7 +294,7 @@ async function probeGatewayReadiness(
finish({
ready: false,
lastError: options.requireStatusHealth
? `Timed out while probing reusable Gateway status and health at ${url}.`
? `Timed out while probing ${options.probeLabel ?? "reusable"} Gateway status and health at ${url}.`
: `Timed out while probing bundled Gateway readiness at ${url}.`
});
}, timeoutMs);
......@@ -373,6 +398,17 @@ async function probeGatewayReadiness(
return;
}
const payload = (frame.payload ?? frame.result ?? {}) as { ok?: unknown; details?: unknown };
if (payload.ok === false) {
finish({
ready: false,
lastError: typeof payload.details === "string" && payload.details.trim()
? `Gateway health probe reported not ok: ${payload.details.trim()}`
: "Gateway health probe reported not ok."
});
return;
}
finish({
ready: true,
version: gatewayVersion,
......@@ -429,11 +465,16 @@ function isGatewayProbeStartupTransient(message?: string): boolean {
}
const normalized = message.toLowerCase();
return normalized.includes("gateway closed during readiness probe")
return isGatewayProbeNoListener(message)
|| normalized.includes("gateway closed during readiness probe")
|| normalized.includes("gateway closed before readiness probe completed")
|| normalized.includes("gateway closed before health probe completed")
|| normalized.includes("timed out while probing reusable gateway")
|| normalized.includes("timed out while probing bundled gateway");
|| normalized.includes("timed out while probing bundled gateway")
|| normalized.includes("gateway health probe reported not ok")
|| normalized.includes("gateway starting; retry shortly")
|| normalized.includes("startup-sidecars")
|| normalized.includes("unavailable");
}
function formatPayloadIssue(
......@@ -1118,7 +1159,8 @@ export class RuntimeManager extends EventEmitter {
try {
child = spawn(paths.nodeExecutable, childArgs, spawnOptions);
} catch (error) {
this.lastError = `Bundled runtime failed to spawn: ${error instanceof Error ? error.message : String(error)}`;
const message = error instanceof Error ? error.message : String(error);
this.lastError = classifyMacSecurityBlockMessage(message) ?? `Bundled runtime failed to spawn: ${message}`;
this.appendLog("error", this.lastError);
this.refreshStatus("error");
return this.status();
......@@ -1138,7 +1180,7 @@ export class RuntimeManager extends EventEmitter {
}
});
child.once("error", (error) => {
this.lastError = `Bundled runtime failed to start: ${error.message}`;
this.lastError = classifyMacSecurityBlockMessage(error.message) ?? `Bundled runtime failed to start: ${error.message}`;
this.lastStoppedAt = new Date().toISOString();
this.child = undefined;
this.managedChildPid = undefined;
......@@ -1155,7 +1197,8 @@ export class RuntimeManager extends EventEmitter {
this.reusedExistingGateway = false;
if (!wasStopping && code !== 0) {
const stderrHint = this.buildStderrHint();
this.lastError = `Bundled runtime exited unexpectedly with code ${code ?? "unknown"}${signal ? ` (${signal})` : ""}${stderrHint ? `: ${stderrHint}` : ""}.`;
const exitMessage = `Bundled runtime exited unexpectedly with code ${code ?? "unknown"}${signal ? ` (${signal})` : ""}${stderrHint ? `: ${stderrHint}` : ""}.`;
this.lastError = classifyMacSecurityBlockMessage(stderrHint) ?? exitMessage;
this.appendLog("error", this.lastError);
this.refreshStatus("error");
return;
......@@ -1306,12 +1349,22 @@ export class RuntimeManager extends EventEmitter {
};
}
const probe = await probeGatewayReadiness(this.gatewayConnection.url, this.gatewayConnection.token);
const probe = await probeGatewayReadiness(this.gatewayConnection.url, this.gatewayConnection.token, GATEWAY_PROBE_TIMEOUT_MS, {
requireStatusHealth: true,
probeLabel: "bundled"
});
if (probe.ready) {
return probe;
}
lastProbeError = probe.lastError;
if (lastProbeError && !isGatewayProbeStartupTransient(lastProbeError)) {
return {
ready: false,
checkedAt: new Date().toISOString(),
lastError: lastProbeError
};
}
if (lastProbeError && lastProbeError !== lastLoggedProbeError) {
this.appendLog("warn", `Bundled Gateway is not ready yet: ${lastProbeError}`);
lastLoggedProbeError = lastProbeError;
......@@ -1323,7 +1376,9 @@ export class RuntimeManager extends EventEmitter {
return {
ready: false,
checkedAt: new Date().toISOString(),
lastError: lastProbeError ?? `Timed out while waiting for bundled Gateway readiness at ${this.gatewayConnection.url}.`
lastError: lastProbeError
? `Bundled Gateway cold-start deadline expired at ${this.gatewayConnection.url}; last probe did not complete successfully.`
: `Bundled Gateway cold-start deadline expired at ${this.gatewayConnection.url}.`
};
}
......@@ -1337,7 +1392,8 @@ export class RuntimeManager extends EventEmitter {
}
const firstProbe = await probeGatewayReadiness(this.gatewayConnection.url, this.gatewayConnection.token, GATEWAY_PROBE_TIMEOUT_MS, {
requireStatusHealth: true
requireStatusHealth: true,
probeLabel: "reusable"
});
if (firstProbe.ready || isGatewayProbeNoListener(firstProbe.lastError) || !isGatewayProbeStartupTransient(firstProbe.lastError)) {
return firstProbe;
......@@ -1351,7 +1407,8 @@ export class RuntimeManager extends EventEmitter {
while (Date.now() < deadline) {
await delay(GATEWAY_READY_POLL_INTERVAL_MS);
const probe = await probeGatewayReadiness(this.gatewayConnection.url, this.gatewayConnection.token, GATEWAY_PROBE_TIMEOUT_MS, {
requireStatusHealth: true
requireStatusHealth: true,
probeLabel: "reusable"
});
if (probe.ready || isGatewayProbeNoListener(probe.lastError) || !isGatewayProbeStartupTransient(probe.lastError)) {
return probe;
......@@ -1494,7 +1551,13 @@ export class RuntimeManager extends EventEmitter {
}
private buildStderrHint(): string | undefined {
const recent = this.lastStderrLines.join(" ").toLowerCase();
const recentRaw = this.lastStderrLines.join(" ");
const securityBlock = classifyMacSecurityBlockMessage(recentRaw);
if (securityBlock) {
return securityBlock;
}
const recent = recentRaw.toLowerCase();
if (recent.includes("name conflict") || recent.includes("hostname conflict") || recent.includes("bonjour")) {
return "gateway name/hostname conflict detected (another OpenClaw instance is running)";
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment