diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index fd68ae5..fd5ed6e 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -7,6 +7,28 @@ on: branches: [main] workflow_call: +# CI flake mitigation: +# ci.yml is triggered TWICE per PR on the same commit — once directly via +# the `pull_request` trigger above ("Frontend Tests & Build" check) and once +# via `workflow_call` from docker-publish.yml ("CI Gate / Frontend Tests & +# Build" check). Both jobs land on the same Actions runner pool at the same +# time and fight for CPU/RAM. Under contention, React's reconciliation in +# `messagesViewFirstContact.test.tsx > removes an approved contact …` +# overruns its 5s waitFor timeout — that's the single failure mode we've +# seen flake on PRs #226, #237, #261, #262, #265, #294, #303, and the +# fd7d6fa push. Backend tests and every other frontend test pass under +# the same conditions, which is what made this look random. +# +# Pinning a concurrency group on the SHA (PR head, or the pushed commit +# for main) serializes the two invocations so neither starves the other. +# We use cancel-in-progress: false so the second one queues instead of +# cancelling — cancelling could leave the PR check stuck "Expected" if +# only one of the two ever finishes. Total CI time grows by ~2 min in +# exchange for deterministic outcomes. +concurrency: + group: ci-${{ github.event.pull_request.head.sha || github.sha }} + cancel-in-progress: false + jobs: frontend: name: Frontend Tests & Build diff --git a/frontend/src/__tests__/mesh/messagesViewFirstContact.test.tsx b/frontend/src/__tests__/mesh/messagesViewFirstContact.test.tsx index dec5baa..b781d97 100644 --- a/frontend/src/__tests__/mesh/messagesViewFirstContact.test.tsx +++ b/frontend/src/__tests__/mesh/messagesViewFirstContact.test.tsx @@ -868,18 +868,27 @@ describe('MessagesView first-contact trust UX', () => { // event (removeContact + setContacts + setComposeStatus + setComposeError). 
// Under CI load the resulting render-and-paint cycle has been observed // to take >1s, which is the default findByText timeout — that race has - // produced flakes on PRs #226, #237, #261, and #262 in succession. - // The settle window is bounded by React's reconciliation, not by any - // network/animation cost, so a generous timeout is the right deflake - // here (the failure mode this masks would be "toast never renders", - // which would still fail at 5s). + // produced flakes on PRs #226, #237, #261, #262, #265, #294, #303, and + // the fd7d6fa push. + // + // The structural root cause is fixed in .github/workflows/ci.yml via a + // concurrency group (ci.yml runs twice in parallel per PR — direct + // trigger + workflow_call from docker-publish.yml — and both jobs land + // on the same Actions runner pool, starving each other). Serializing + // them via concurrency removes the resource contention. + // + // We also bump the timeout here as belt-and-suspenders. The settle + // window is bounded by React's reconciliation, not by any network or + // animation cost, so a generous timeout is the right deflake (the + // failure mode this masks would be "toast never renders", which still + // fails at 15s, assuming this test's own timeout exceeds 15s — TODO confirm). await waitFor( () => { expect( screen.getByText(/Removed contact: Remove Me\./i), ).toBeInTheDocument(); }, - { timeout: 5000, interval: 50 }, + { timeout: 15000, interval: 50 }, ); expect(screen.queryByText('Remove Me')).not.toBeInTheDocument(); });