feat(v0.5.3): AdoptExistingPersistentSession with race guard, UpdatedAt bump, attach-error propagation
This commit is contained in:
@@ -0,0 +1,206 @@
|
||||
package session
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"time"
|
||||
|
||||
"github.com/warrenronsiek/ctask/internal/lockfile"
|
||||
"github.com/warrenronsiek/ctask/internal/shell"
|
||||
"github.com/warrenronsiek/ctask/internal/workspace"
|
||||
)
|
||||
|
||||
// adoptAttacher and adoptPoll are test seams that *wrap* shell primitives
|
||||
// rather than hand-rolling tmux invocations. Production calls go through
|
||||
// shell.AttachSession / shell.PollSessionEnd. Tests override these
|
||||
// variables; do not run such tests with t.Parallel().
|
||||
var (
|
||||
adoptAttacher = shell.AttachSession
|
||||
adoptPoll = shell.PollSessionEnd
|
||||
)
|
||||
|
||||
// AdoptExistingPersistentSession is the adopted-reattach path. It is
|
||||
// invoked when a tmux session for the workspace exists but the lease is
|
||||
// missing, stale, or fresh-but-from-another-host (the cmd-layer dispatcher
|
||||
// has already prompted for confirmation in the fresh_remote case before
|
||||
// calling here — see cmd/persistent.go::confirmFreshRemoteAdoption).
|
||||
//
|
||||
// Eight-step flow (see v0.5.3-spec.md §3 path C):
|
||||
// 1. Print one diagnostic line announcing the adoption.
|
||||
// 2. Acquire write lock; under lock, re-read lease state. If now
|
||||
// fresh-local (concurrent adopter raced ahead), release the lock and
|
||||
// fall through to AttachExisting — the race winner owns the lease.
|
||||
// Otherwise, remove the orphaned lease, write a new lease for this
|
||||
// process, and bump task.yaml.UpdatedAt.
|
||||
// 3. Capture a fresh start manifest. Without it, finalize has no
|
||||
// reliable diff baseline.
|
||||
// 4. Start the heartbeat goroutine.
|
||||
// 5. shell.AttachSession (returns nil on clean exit, error on non-zero).
|
||||
// 6. Polling loop until tmux reports session gone — runs ONLY after a
|
||||
// successful attach.
|
||||
// 7. Stop the heartbeat goroutine.
|
||||
// 8. finalize with SessionOwnership="adopted" and AdoptedFromOrphanAt set.
|
||||
//
|
||||
// On attach failure (step 5 returns error), steps 6-8 are skipped and the
|
||||
// error is returned — the user sees the underlying tmux failure.
|
||||
func AdoptExistingPersistentSession(tmuxPath, sessionName, wsDir string, opts LaunchOpts) error {
|
||||
fmt.Fprintln(os.Stderr,
|
||||
"[ctask] adopting orphaned persistent session (previous owner exited without finalizing)")
|
||||
|
||||
startTime := time.Now().UTC().Truncate(time.Second)
|
||||
adoptedAt := startTime
|
||||
leasePath := LeasePath(wsDir)
|
||||
|
||||
var raced bool
|
||||
skipped, lockErr := lockfile.WithLock(
|
||||
ctaskWriteLockPath(wsDir),
|
||||
sessionWriteLockTimeout, sessionWriteLockStaleAfter,
|
||||
func() error {
|
||||
// Re-check under lock. If a concurrent adopter beat us to it,
|
||||
// fall through to passive reattach. No lease writes, no
|
||||
// task.yaml.UpdatedAt bump on this branch.
|
||||
if InspectLease(wsDir) == LeaseStateFreshLocal {
|
||||
raced = true
|
||||
return nil
|
||||
}
|
||||
if _, rmErr := CleanupStaleLease(leasePath, StaleLeaseAfter); rmErr != nil {
|
||||
fmt.Fprintf(os.Stderr,
|
||||
"[ctask] warning: failed to remove orphaned lease: %v\n", rmErr)
|
||||
}
|
||||
lease := NewLease(startTime, opts.Agent, opts.Mode)
|
||||
if err := WriteLease(leasePath, lease); err != nil {
|
||||
return fmt.Errorf("writing lease: %w", err)
|
||||
}
|
||||
// Adoption-only: bump task.yaml.UpdatedAt. Passive reattach
|
||||
// (and the race fall-through above) leave it untouched.
|
||||
metaPath := filepath.Join(wsDir, "task.yaml")
|
||||
if meta, err := workspace.ReadMeta(metaPath); err == nil && meta != nil {
|
||||
meta.UpdatedAt = startTime
|
||||
if err := workspace.WriteMeta(metaPath, meta); err != nil {
|
||||
fmt.Fprintf(os.Stderr,
|
||||
"[ctask] warning: failed to update task.yaml on adoption: %v\n", err)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
},
|
||||
)
|
||||
if skipped {
|
||||
fmt.Fprintf(os.Stderr,
|
||||
"[ctask] Warning: could not acquire metadata lock; falling through to passive reattach\n")
|
||||
return AttachExisting(tmuxPath, sessionName)
|
||||
}
|
||||
if lockErr != nil {
|
||||
return fmt.Errorf("adoption lease write: %w", lockErr)
|
||||
}
|
||||
if raced {
|
||||
return AttachExisting(tmuxPath, sessionName)
|
||||
}
|
||||
|
||||
// Step 3: fresh start manifest — load-bearing for finalize's diff baseline.
|
||||
startManifest, err := CaptureManifest(wsDir)
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr,
|
||||
"[ctask] warning: failed to capture start manifest during adoption: %v; finalize diff will be skipped\n", err)
|
||||
} else {
|
||||
mPath := manifestStartPath(wsDir)
|
||||
if _, lockErr := lockfile.WithLock(
|
||||
ctaskWriteLockPath(wsDir),
|
||||
sessionWriteLockTimeout, sessionWriteLockStaleAfter,
|
||||
func() error { return WriteManifest(mPath, startManifest) },
|
||||
); lockErr != nil {
|
||||
fmt.Fprintf(os.Stderr,
|
||||
"[ctask] warning: failed to write start manifest: %v\n", lockErr)
|
||||
startManifest = nil
|
||||
}
|
||||
}
|
||||
|
||||
// Step 4: heartbeat.
|
||||
hb := StartHeartbeat(leasePath, HeartbeatInterval)
|
||||
|
||||
// Step 5: attach. Non-zero exit is a real failure; surface it
|
||||
// and skip steps 6-8 (polling and finalize). Stop the heartbeat first
|
||||
// so we don't leak the goroutine.
|
||||
if attachErr := adoptAttacher(tmuxPath, sessionName); attachErr != nil {
|
||||
hb.Stop()
|
||||
return attachErr
|
||||
}
|
||||
|
||||
// Step 6: poll until session ends.
|
||||
adoptPoll(tmuxPath, sessionName, shell.PollInterval)
|
||||
|
||||
// Step 7: stop heartbeat.
|
||||
hb.Stop()
|
||||
|
||||
// Step 8: finalize with adoption fields.
|
||||
endTime := time.Now().UTC().Truncate(time.Second)
|
||||
if startManifest != nil {
|
||||
if err := finalizeAdopted(opts, wsDir, startManifest, startTime, endTime, adoptedAt); err != nil {
|
||||
fmt.Fprintf(os.Stderr, "[ctask] warning: adoption finalize failed: %v\n", err)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// finalizeAdopted is the adoption-specific finalize path. Mirrors
|
||||
// session.Run's finalize() but stamps SessionOwnership = "adopted" and
|
||||
// AdoptedFromOrphanAt onto the summary.
|
||||
func finalizeAdopted(opts LaunchOpts, wsDir string, startManifest *Manifest, startTime, endTime, adoptedAt time.Time) error {
|
||||
endManifest, err := CaptureManifest(wsDir)
|
||||
if err != nil {
|
||||
return fmt.Errorf("capturing end manifest: %w", err)
|
||||
}
|
||||
diff := DiffManifests(startManifest, endManifest)
|
||||
|
||||
agent := opts.Agent
|
||||
if opts.Shell {
|
||||
agent = "shell"
|
||||
}
|
||||
info := &SessionInfo{
|
||||
Agent: agent,
|
||||
Mode: opts.Mode,
|
||||
StartTime: startTime,
|
||||
EndTime: endTime,
|
||||
Diff: diff,
|
||||
}
|
||||
|
||||
sessionID := NewSessionID(currentHostname(), os.Getpid(), startTime)
|
||||
if l, err := ReadLease(LeasePath(wsDir)); err == nil && l != nil {
|
||||
sessionID = l.SessionID
|
||||
}
|
||||
|
||||
summary := SummarizeFromDiff(
|
||||
sessionID, currentHostname(), agent, opts.Mode,
|
||||
startTime, endTime, diff, endManifest,
|
||||
)
|
||||
summary.EndReason = "tmux_session_ended"
|
||||
summary.DetectedVia = "polling"
|
||||
summary.SessionOwnership = "adopted"
|
||||
summary.AdoptedFromOrphanAt = &adoptedAt
|
||||
|
||||
skipped, lockErr := lockfile.WithLock(
|
||||
ctaskWriteLockPath(wsDir),
|
||||
sessionWriteLockTimeout, sessionWriteLockStaleAfter,
|
||||
func() error {
|
||||
if err := AppendSessionLog(wsDir, info); err != nil {
|
||||
fmt.Fprintf(os.Stderr, "[ctask] warning: append session log failed: %v\n", err)
|
||||
}
|
||||
if err := WriteSummary(SummaryPath(wsDir), summary); err != nil {
|
||||
return fmt.Errorf("write summary: %w", err)
|
||||
}
|
||||
if rmErr := os.Remove(LeasePath(wsDir)); rmErr != nil && !os.IsNotExist(rmErr) {
|
||||
fmt.Fprintf(os.Stderr, "[ctask] warning: could not remove lease: %v\n", rmErr)
|
||||
}
|
||||
if rmErr := os.Remove(manifestStartPath(wsDir)); rmErr != nil && !os.IsNotExist(rmErr) {
|
||||
fmt.Fprintf(os.Stderr, "[ctask] warning: could not remove start manifest: %v\n", rmErr)
|
||||
}
|
||||
return nil
|
||||
},
|
||||
)
|
||||
if skipped {
|
||||
fmt.Fprintf(os.Stderr,
|
||||
"[ctask] Warning: could not acquire metadata lock at adoption finalize; skipping summary write\n")
|
||||
return nil
|
||||
}
|
||||
return lockErr
|
||||
}
|
||||
Reference in New Issue
Block a user