fix: resilient subscription CLI calls (retry, richer errors, capped concurrency)

The 'recon failed (claude subscription CLI failed: )' error was a transient CLI failure (rate limit / cold start) that was reported with a blank message and never retried.

- chat_cli: on non-zero exit, surface the exit code plus stderr, falling back to stdout when stderr is empty (the CLI often writes the real reason to stdout, not stderr); treat empty output as an error
- pool.one(): make up to 3 attempts with backoff to ride out transient failures (both subscription and API paths)
- with_auth: cap concurrency at 3 on the subscription path — spawning many parallel CLI processes itself trips provider rate limits

Verified: live subscription run recovers and completes recon → select → exploit → vote → artifacts.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-29 23:05:30 +02:00 · 2026-06-23 13:07:55 -03:00
parent 9dfcea87bc
commit c6fd5d6ac8
2 changed files with 46 additions and 10 deletions
@@ -180,10 +180,29 @@ impl ChatClient {
            // Drop closes stdin so the CLI processes the prompt and exits.
        }
        let out = child.wait_with_output().await?;
+        let stdout = String::from_utf8_lossy(&out.stdout).trim().to_string();
+        let stderr = String::from_utf8_lossy(&out.stderr);
        if !out.status.success() {
-            return Err(anyhow!("{} subscription CLI failed: {}", bin, truncate(&String::from_utf8_lossy(&out.stderr), 200)));
+            // The CLI often writes the real reason (rate limit, auth) to stdout, not
+            // stderr — use stderr, else stdout, plus the exit code so it isn't blank.
+            let detail = if !stderr.trim().is_empty() {
+                stderr.trim().to_string()
+            } else if !stdout.is_empty() {
+                stdout.clone()
+            } else {
+                "no output".to_string()
+            };
+            return Err(anyhow!(
+                "{} subscription CLI exit {}: {}",
+                bin,
+                out.status.code().map(|c| c.to_string()).unwrap_or_else(|| "signal".into()),
+                truncate(&detail, 240)
+            ));
        }
-        Ok(String::from_utf8_lossy(&out.stdout).trim().to_string())
+        if stdout.is_empty() {
+            return Err(anyhow!("{} subscription CLI returned empty output", bin));
+        }
+        Ok(stdout)
    }
 }

@@ -28,7 +28,9 @@ impl ModelPool {
        subscription: bool,
        mcp_config: Option<String>,
    ) -> Self {
-        let concurrency = concurrency.max(1);
+        // Subscription spawns one CLI process per call; too many in parallel
+        // trips provider rate limits, so cap concurrency on that path.
+        let concurrency = if subscription { concurrency.clamp(1, 3) } else { concurrency.max(1) };
        ModelPool {
            client: ChatClient::new(),
            sem: Arc::new(Semaphore::new(concurrency)),
@@ -42,15 +44,30 @@ impl ModelPool {
        }
    }

-    /// One completion for a model, via subscription CLI (optionally with MCP) or HTTP API.
+    /// One completion for a model, via subscription CLI (optionally with MCP) or
+    /// HTTP API, with a short retry/backoff to ride out transient failures
+    /// (rate limits, MCP cold-starts, network blips).
    async fn one(&self, m: &ModelRef, system: &str, user: &str) -> Result<String> {
-        if self.subscription && cli_binary_for(&m.provider).is_some() {
-            return self
-                .client
-                .chat_cli(&m.provider, &m.model, system, user, self.mcp_config.as_deref())
-                .await;
+        let use_cli = self.subscription && cli_binary_for(&m.provider).is_some();
+        let mut last = anyhow::anyhow!("no attempt");
+        for attempt in 0..3u64 {
+            if attempt > 0 {
+                // 1.5s, 6s backoff.
+                tokio::time::sleep(std::time::Duration::from_millis(1500 * attempt * attempt.max(1))).await;
+            }
+            let r = if use_cli {
+                self.client
+                    .chat_cli(&m.provider, &m.model, system, user, self.mcp_config.as_deref())
+                    .await
+            } else {
+                self.client.chat(m, system, user).await
+            };
+            match r {
+                Ok(t) => return Ok(t),
+                Err(e) => last = e,
+            }
        }
-        self.client.chat(m, system, user).await
+        Err(last)
    }

    /// Complete a prompt, trying each candidate model until one succeeds.