feat: record stream interruption reasons via StreamStatus

- Add StreamStatus type (relay/common) to track stream end reason
  (done/timeout/client_gone/scanner_error/eof/panic/ping_fail) and
  accumulate soft errors during streaming via sync.Once + sync.Mutex.
- Add StreamResult (relay/helper) as the callback interface: adapters
  call sr.Error() for soft errors, sr.Stop() for fatal, sr.Done() for
  normal completion. No early-return problem — multiple errors per chunk
  are naturally supported.
- Refactor StreamScannerHandler callback from func(string) bool to
  func(string, *StreamResult). All 9 channel adapters updated.
- Write stream_status into log other JSON field (admin-only) with
  status ok/error, end_reason, error_count, and error messages.
- Frontend: display stream status in log detail expansion for admins.
This commit is contained in:
CaIon
2026-03-31 16:50:24 +08:00
parent 5402bf417d
commit 5238f279db
20 changed files with 764 additions and 163 deletions
+52
View File
@@ -0,0 +1,52 @@
package helper
import (
relaycommon "github.com/QuantumNous/new-api/relay/common"
)
// StreamResult is passed to each dataHandler invocation, providing methods
// to record soft errors, signal fatal stops, or mark normal completion.
// StreamScannerHandler checks IsStopped() after each callback invocation.
type StreamResult struct {
status *relaycommon.StreamStatus
stopped bool
}
func newStreamResult(status *relaycommon.StreamStatus) *StreamResult {
return &StreamResult{status: status}
}
// Error records a soft error. The stream continues processing.
// Can be called multiple times per chunk.
func (r *StreamResult) Error(err error) {
if err == nil {
return
}
r.status.RecordError(err.Error())
}
// Stop records a fatal error and marks the stream to stop after this chunk.
func (r *StreamResult) Stop(err error) {
if err != nil {
r.status.RecordError(err.Error())
}
r.status.SetEndReason(relaycommon.StreamEndReasonHandlerStop, err)
r.stopped = true
}
// Done signals that the handler has finished processing normally
// (e.g., Dify "message_end"). The stream stops after this chunk.
func (r *StreamResult) Done() {
r.status.SetEndReason(relaycommon.StreamEndReasonDone, nil)
r.stopped = true
}
// IsStopped returns whether Stop() or Done() was called during this chunk.
func (r *StreamResult) IsStopped() bool {
return r.stopped
}
// reset clears the per-chunk stopped flag so the object can be reused.
func (r *StreamResult) reset() {
r.stopped = false
}
+26 -10
View File
@@ -34,12 +34,15 @@ func getScannerBufferSize() int {
return DefaultMaxScannerBufferSize
}
func StreamScannerHandler(c *gin.Context, resp *http.Response, info *relaycommon.RelayInfo, dataHandler func(data string) bool) {
func StreamScannerHandler(c *gin.Context, resp *http.Response, info *relaycommon.RelayInfo, dataHandler func(data string, sr *StreamResult)) {
if resp == nil || dataHandler == nil {
return
}
// 无条件新建 StreamStatus
info.StreamStatus = relaycommon.NewStreamStatus()
// 确保响应体总是被关闭
defer func() {
if resp.Body != nil {
@@ -121,6 +124,7 @@ func StreamScannerHandler(c *gin.Context, resp *http.Response, info *relaycommon
wg.Done()
if r := recover(); r != nil {
logger.LogError(c, fmt.Sprintf("ping goroutine panic: %v", r))
info.StreamStatus.SetEndReason(relaycommon.StreamEndReasonPanic, fmt.Errorf("ping panic: %v", r))
common.SafeSendBool(stopChan, true)
}
if common.DebugEnabled {
@@ -148,6 +152,7 @@ func StreamScannerHandler(c *gin.Context, resp *http.Response, info *relaycommon
case err := <-done:
if err != nil {
logger.LogError(c, "ping data error: "+err.Error())
info.StreamStatus.SetEndReason(relaycommon.StreamEndReasonPingFail, err)
return
}
if common.DebugEnabled {
@@ -155,6 +160,7 @@ func StreamScannerHandler(c *gin.Context, resp *http.Response, info *relaycommon
}
case <-time.After(10 * time.Second):
logger.LogError(c, "ping data send timeout")
info.StreamStatus.SetEndReason(relaycommon.StreamEndReasonPingFail, fmt.Errorf("ping send timeout"))
return
case <-ctx.Done():
return
@@ -184,14 +190,17 @@ func StreamScannerHandler(c *gin.Context, resp *http.Response, info *relaycommon
wg.Done()
if r := recover(); r != nil {
logger.LogError(c, fmt.Sprintf("data handler goroutine panic: %v", r))
info.StreamStatus.SetEndReason(relaycommon.StreamEndReasonPanic, fmt.Errorf("handler panic: %v", r))
}
common.SafeSendBool(stopChan, true)
}()
sr := newStreamResult(info.StreamStatus)
for data := range dataChan {
sr.reset()
writeMutex.Lock()
success := dataHandler(data)
dataHandler(data, sr)
writeMutex.Unlock()
if !success {
if sr.IsStopped() {
return
}
}
@@ -205,6 +214,7 @@ func StreamScannerHandler(c *gin.Context, resp *http.Response, info *relaycommon
wg.Done()
if r := recover(); r != nil {
logger.LogError(c, fmt.Sprintf("scanner goroutine panic: %v", r))
info.StreamStatus.SetEndReason(relaycommon.StreamEndReasonPanic, fmt.Errorf("scanner panic: %v", r))
}
common.SafeSendBool(stopChan, true)
if common.DebugEnabled {
@@ -220,6 +230,7 @@ func StreamScannerHandler(c *gin.Context, resp *http.Response, info *relaycommon
case <-ctx.Done():
return
case <-c.Request.Context().Done():
info.StreamStatus.SetEndReason(relaycommon.StreamEndReasonClientGone, c.Request.Context().Err())
return
default:
}
@@ -253,7 +264,7 @@ func StreamScannerHandler(c *gin.Context, resp *http.Response, info *relaycommon
return
}
} else {
// done, 处理完成标志,直接退出停止读取剩余数据防止出错
info.StreamStatus.SetEndReason(relaycommon.StreamEndReasonDone, nil)
if common.DebugEnabled {
println("received [DONE], stopping scanner")
}
@@ -264,20 +275,25 @@ func StreamScannerHandler(c *gin.Context, resp *http.Response, info *relaycommon
if err := scanner.Err(); err != nil {
if err != io.EOF {
logger.LogError(c, "scanner error: "+err.Error())
info.StreamStatus.SetEndReason(relaycommon.StreamEndReasonScannerErr, err)
}
}
info.StreamStatus.SetEndReason(relaycommon.StreamEndReasonEOF, nil)
})
// 主循环等待完成或超时
select {
case <-ticker.C:
// 超时处理逻辑
logger.LogError(c, "streaming timeout")
info.StreamStatus.SetEndReason(relaycommon.StreamEndReasonTimeout, nil)
case <-stopChan:
// 正常结束
logger.LogInfo(c, "streaming finished")
// EndReason already set by the goroutine that triggered stopChan
case <-c.Request.Context().Done():
// 客户端断开连接
logger.LogInfo(c, "client disconnected")
info.StreamStatus.SetEndReason(relaycommon.StreamEndReasonClientGone, c.Request.Context().Err())
}
if info.StreamStatus.IsNormalEnd() && !info.StreamStatus.HasErrors() {
logger.LogInfo(c, fmt.Sprintf("stream ended: %s", info.StreamStatus.Summary()))
} else {
logger.LogError(c, fmt.Sprintf("stream ended: %s, received=%d", info.StreamStatus.Summary(), info.ReceivedResponseCount))
}
}
+216 -47
View File
@@ -56,8 +56,6 @@ func buildSSEBody(n int) string {
return b.String()
}
// slowReader wraps a reader and injects a delay before each Read call,
// simulating a slow upstream that trickles data.
type slowReader struct {
r io.Reader
delay time.Duration
@@ -79,7 +77,7 @@ func TestStreamScannerHandler_NilInputs(t *testing.T) {
info := &relaycommon.RelayInfo{ChannelMeta: &relaycommon.ChannelMeta{}}
StreamScannerHandler(c, nil, info, func(data string) bool { return true })
StreamScannerHandler(c, nil, info, func(data string, sr *StreamResult) {})
StreamScannerHandler(c, &http.Response{Body: io.NopCloser(strings.NewReader(""))}, info, nil)
}
@@ -89,9 +87,8 @@ func TestStreamScannerHandler_EmptyBody(t *testing.T) {
c, resp, info := setupStreamTest(t, strings.NewReader(""))
var called atomic.Bool
StreamScannerHandler(c, resp, info, func(data string) bool {
StreamScannerHandler(c, resp, info, func(data string, sr *StreamResult) {
called.Store(true)
return true
})
assert.False(t, called.Load(), "handler should not be called for empty body")
@@ -105,9 +102,8 @@ func TestStreamScannerHandler_1000Chunks(t *testing.T) {
c, resp, info := setupStreamTest(t, strings.NewReader(body))
var count atomic.Int64
StreamScannerHandler(c, resp, info, func(data string) bool {
StreamScannerHandler(c, resp, info, func(data string, sr *StreamResult) {
count.Add(1)
return true
})
assert.Equal(t, int64(numChunks), count.Load())
@@ -124,9 +120,8 @@ func TestStreamScannerHandler_10000Chunks(t *testing.T) {
var count atomic.Int64
start := time.Now()
StreamScannerHandler(c, resp, info, func(data string) bool {
StreamScannerHandler(c, resp, info, func(data string, sr *StreamResult) {
count.Add(1)
return true
})
elapsed := time.Since(start)
@@ -145,11 +140,10 @@ func TestStreamScannerHandler_OrderPreserved(t *testing.T) {
var mu sync.Mutex
received := make([]string, 0, numChunks)
StreamScannerHandler(c, resp, info, func(data string) bool {
StreamScannerHandler(c, resp, info, func(data string, sr *StreamResult) {
mu.Lock()
received = append(received, data)
mu.Unlock()
return true
})
require.Equal(t, numChunks, len(received))
@@ -166,31 +160,32 @@ func TestStreamScannerHandler_DoneStopsScanner(t *testing.T) {
c, resp, info := setupStreamTest(t, strings.NewReader(body))
var count atomic.Int64
StreamScannerHandler(c, resp, info, func(data string) bool {
StreamScannerHandler(c, resp, info, func(data string, sr *StreamResult) {
count.Add(1)
return true
})
assert.Equal(t, int64(50), count.Load(), "data after [DONE] must not be processed")
}
func TestStreamScannerHandler_HandlerFailureStops(t *testing.T) {
func TestStreamScannerHandler_StopStopsStream(t *testing.T) {
t.Parallel()
const numChunks = 200
body := buildSSEBody(numChunks)
c, resp, info := setupStreamTest(t, strings.NewReader(body))
const failAt = 50
const stopAt int64 = 50
var count atomic.Int64
StreamScannerHandler(c, resp, info, func(data string) bool {
StreamScannerHandler(c, resp, info, func(data string, sr *StreamResult) {
n := count.Add(1)
return n < failAt
if n >= stopAt {
sr.Stop(fmt.Errorf("fatal at %d", n))
}
})
// The worker stops at failAt; the scanner may have read ahead,
// but the handler should not be called beyond failAt.
assert.Equal(t, int64(failAt), count.Load())
assert.Equal(t, stopAt, count.Load())
require.NotNil(t, info.StreamStatus)
assert.Equal(t, relaycommon.StreamEndReasonHandlerStop, info.StreamStatus.EndReason)
}
func TestStreamScannerHandler_SkipsNonDataLines(t *testing.T) {
@@ -210,9 +205,8 @@ func TestStreamScannerHandler_SkipsNonDataLines(t *testing.T) {
c, resp, info := setupStreamTest(t, strings.NewReader(b.String()))
var count atomic.Int64
StreamScannerHandler(c, resp, info, func(data string) bool {
StreamScannerHandler(c, resp, info, func(data string, sr *StreamResult) {
count.Add(1)
return true
})
assert.Equal(t, int64(100), count.Load())
@@ -225,25 +219,18 @@ func TestStreamScannerHandler_DataWithExtraSpaces(t *testing.T) {
c, resp, info := setupStreamTest(t, strings.NewReader(body))
var got string
StreamScannerHandler(c, resp, info, func(data string) bool {
StreamScannerHandler(c, resp, info, func(data string, sr *StreamResult) {
got = data
return true
})
assert.Equal(t, "{\"trimmed\":true}", got)
}
// ---------- Decoupling: scanner not blocked by slow handler ----------
// ---------- Decoupling ----------
func TestStreamScannerHandler_ScannerDecoupledFromSlowHandler(t *testing.T) {
t.Parallel()
// Strategy: use a slow upstream (io.Pipe, 10ms per chunk) AND a slow handler (20ms per chunk).
// If the scanner were synchronously coupled to the handler, total time would be
// ~numChunks * (10ms + 20ms) = 30ms * 50 = 1500ms.
// With decoupling, total time should be closer to
// ~numChunks * max(10ms, 20ms) = 20ms * 50 = 1000ms
// because the scanner reads ahead into the buffer while the handler processes.
const numChunks = 50
const upstreamDelay = 10 * time.Millisecond
const handlerDelay = 20 * time.Millisecond
@@ -273,10 +260,9 @@ func TestStreamScannerHandler_ScannerDecoupledFromSlowHandler(t *testing.T) {
start := time.Now()
done := make(chan struct{})
go func() {
StreamScannerHandler(c, resp, info, func(data string) bool {
StreamScannerHandler(c, resp, info, func(data string, sr *StreamResult) {
time.Sleep(handlerDelay)
count.Add(1)
return true
})
close(done)
}()
@@ -293,7 +279,6 @@ func TestStreamScannerHandler_ScannerDecoupledFromSlowHandler(t *testing.T) {
coupledTime := time.Duration(numChunks) * (upstreamDelay + handlerDelay)
t.Logf("elapsed=%v, coupled_estimate=%v", elapsed, coupledTime)
// If decoupled, elapsed should be well under the coupled estimate.
assert.Less(t, elapsed, coupledTime*85/100,
"decoupled elapsed time (%v) should be significantly less than coupled estimate (%v)", elapsed, coupledTime)
}
@@ -311,9 +296,8 @@ func TestStreamScannerHandler_SlowUpstreamFastHandler(t *testing.T) {
done := make(chan struct{})
go func() {
StreamScannerHandler(c, resp, info, func(data string) bool {
StreamScannerHandler(c, resp, info, func(data string, sr *StreamResult) {
count.Add(1)
return true
})
close(done)
}()
@@ -344,8 +328,6 @@ func TestStreamScannerHandler_PingSentDuringSlowUpstream(t *testing.T) {
setting.PingIntervalSeconds = oldSeconds
})
// Create a reader that delivers data slowly: one chunk every 500ms over 3.5 seconds.
// The ping interval is 1s, so we should see at least 2 pings.
pr, pw := io.Pipe()
go func() {
defer pw.Close()
@@ -372,9 +354,8 @@ func TestStreamScannerHandler_PingSentDuringSlowUpstream(t *testing.T) {
var count atomic.Int64
done := make(chan struct{})
go func() {
StreamScannerHandler(c, resp, info, func(data string) bool {
StreamScannerHandler(c, resp, info, func(data string, sr *StreamResult) {
count.Add(1)
return true
})
close(done)
}()
@@ -436,9 +417,8 @@ func TestStreamScannerHandler_PingDisabledByRelayInfo(t *testing.T) {
var count atomic.Int64
done := make(chan struct{})
go func() {
StreamScannerHandler(c, resp, info, func(data string) bool {
StreamScannerHandler(c, resp, info, func(data string, sr *StreamResult) {
count.Add(1)
return true
})
close(done)
}()
@@ -456,6 +436,199 @@ func TestStreamScannerHandler_PingDisabledByRelayInfo(t *testing.T) {
assert.Equal(t, 0, pingCount, "pings should be disabled when DisablePing=true")
}
// ---------- StreamStatus integration ----------
func TestStreamScannerHandler_StreamStatus_DoneReason(t *testing.T) {
t.Parallel()
body := buildSSEBody(10)
c, resp, info := setupStreamTest(t, strings.NewReader(body))
StreamScannerHandler(c, resp, info, func(data string, sr *StreamResult) {})
require.NotNil(t, info.StreamStatus)
assert.Equal(t, relaycommon.StreamEndReasonDone, info.StreamStatus.EndReason)
assert.Nil(t, info.StreamStatus.EndError)
assert.True(t, info.StreamStatus.IsNormalEnd())
assert.False(t, info.StreamStatus.HasErrors())
}
func TestStreamScannerHandler_StreamStatus_EOFWithoutDone(t *testing.T) {
t.Parallel()
var b strings.Builder
for i := 0; i < 5; i++ {
fmt.Fprintf(&b, "data: {\"id\":%d}\n", i)
}
c, resp, info := setupStreamTest(t, strings.NewReader(b.String()))
StreamScannerHandler(c, resp, info, func(data string, sr *StreamResult) {})
require.NotNil(t, info.StreamStatus)
assert.Equal(t, relaycommon.StreamEndReasonEOF, info.StreamStatus.EndReason)
assert.True(t, info.StreamStatus.IsNormalEnd())
}
func TestStreamScannerHandler_StreamStatus_HandlerStop(t *testing.T) {
t.Parallel()
body := buildSSEBody(100)
c, resp, info := setupStreamTest(t, strings.NewReader(body))
var count atomic.Int64
StreamScannerHandler(c, resp, info, func(data string, sr *StreamResult) {
n := count.Add(1)
if n >= 10 {
sr.Stop(fmt.Errorf("stop at 10"))
}
})
require.NotNil(t, info.StreamStatus)
assert.Equal(t, relaycommon.StreamEndReasonHandlerStop, info.StreamStatus.EndReason)
assert.True(t, info.StreamStatus.HasErrors())
}
func TestStreamScannerHandler_StreamStatus_HandlerDone(t *testing.T) {
t.Parallel()
body := buildSSEBody(20)
c, resp, info := setupStreamTest(t, strings.NewReader(body))
var count atomic.Int64
StreamScannerHandler(c, resp, info, func(data string, sr *StreamResult) {
n := count.Add(1)
if n >= 5 {
sr.Done()
}
})
assert.Equal(t, int64(5), count.Load())
require.NotNil(t, info.StreamStatus)
assert.Equal(t, relaycommon.StreamEndReasonDone, info.StreamStatus.EndReason)
assert.False(t, info.StreamStatus.HasErrors())
}
func TestStreamScannerHandler_StreamStatus_Timeout(t *testing.T) {
// Not parallel: modifies global constant.StreamingTimeout
oldTimeout := constant.StreamingTimeout
constant.StreamingTimeout = 2
t.Cleanup(func() { constant.StreamingTimeout = oldTimeout })
pr, pw := io.Pipe()
go func() {
fmt.Fprint(pw, "data: {\"id\":1}\n")
time.Sleep(10 * time.Second)
pw.Close()
}()
recorder := httptest.NewRecorder()
c, _ := gin.CreateTestContext(recorder)
c.Request = httptest.NewRequest(http.MethodPost, "/v1/chat/completions", nil)
resp := &http.Response{Body: pr}
info := &relaycommon.RelayInfo{ChannelMeta: &relaycommon.ChannelMeta{}}
done := make(chan struct{})
go func() {
StreamScannerHandler(c, resp, info, func(data string, sr *StreamResult) {})
close(done)
}()
select {
case <-done:
case <-time.After(15 * time.Second):
t.Fatal("timed out waiting for stream timeout")
}
require.NotNil(t, info.StreamStatus)
assert.Equal(t, relaycommon.StreamEndReasonTimeout, info.StreamStatus.EndReason)
assert.False(t, info.StreamStatus.IsNormalEnd())
}
func TestStreamScannerHandler_StreamStatus_SoftErrors(t *testing.T) {
t.Parallel()
body := buildSSEBody(10)
c, resp, info := setupStreamTest(t, strings.NewReader(body))
StreamScannerHandler(c, resp, info, func(data string, sr *StreamResult) {
sr.Error(fmt.Errorf("soft error for chunk"))
})
require.NotNil(t, info.StreamStatus)
assert.Equal(t, relaycommon.StreamEndReasonDone, info.StreamStatus.EndReason)
assert.True(t, info.StreamStatus.HasErrors())
assert.Equal(t, 10, info.StreamStatus.TotalErrorCount())
}
func TestStreamScannerHandler_StreamStatus_MultipleErrorsPerChunk(t *testing.T) {
t.Parallel()
body := buildSSEBody(5)
c, resp, info := setupStreamTest(t, strings.NewReader(body))
StreamScannerHandler(c, resp, info, func(data string, sr *StreamResult) {
sr.Error(fmt.Errorf("error A"))
sr.Error(fmt.Errorf("error B"))
})
require.NotNil(t, info.StreamStatus)
assert.Equal(t, relaycommon.StreamEndReasonDone, info.StreamStatus.EndReason)
assert.Equal(t, 10, info.StreamStatus.TotalErrorCount())
}
func TestStreamScannerHandler_StreamStatus_ErrorThenStop(t *testing.T) {
t.Parallel()
// Use a large body without [DONE] to avoid race between scanner's [DONE]
// and handler's Stop on the sync.Once EndReason.
var b strings.Builder
for i := 0; i < 100; i++ {
fmt.Fprintf(&b, "data: {\"id\":%d}\n", i)
}
c, resp, info := setupStreamTest(t, strings.NewReader(b.String()))
var count atomic.Int64
StreamScannerHandler(c, resp, info, func(data string, sr *StreamResult) {
count.Add(1)
sr.Error(fmt.Errorf("soft error"))
sr.Stop(fmt.Errorf("fatal"))
})
assert.Equal(t, int64(1), count.Load())
require.NotNil(t, info.StreamStatus)
assert.Equal(t, relaycommon.StreamEndReasonHandlerStop, info.StreamStatus.EndReason)
assert.Equal(t, 2, info.StreamStatus.TotalErrorCount())
}
func TestStreamScannerHandler_StreamStatus_InitializedIfNil(t *testing.T) {
t.Parallel()
body := buildSSEBody(1)
c, resp, info := setupStreamTest(t, strings.NewReader(body))
assert.Nil(t, info.StreamStatus)
StreamScannerHandler(c, resp, info, func(data string, sr *StreamResult) {})
assert.NotNil(t, info.StreamStatus)
}
func TestStreamScannerHandler_StreamStatus_PreInitialized(t *testing.T) {
t.Parallel()
body := buildSSEBody(5)
c, resp, info := setupStreamTest(t, strings.NewReader(body))
info.StreamStatus = relaycommon.NewStreamStatus()
info.StreamStatus.RecordError("pre-existing error")
StreamScannerHandler(c, resp, info, func(data string, sr *StreamResult) {})
assert.Equal(t, relaycommon.StreamEndReasonDone, info.StreamStatus.EndReason)
assert.Equal(t, 1, info.StreamStatus.TotalErrorCount())
}
func TestStreamScannerHandler_PingInterleavesWithSlowUpstream(t *testing.T) {
t.Parallel()
@@ -469,9 +642,6 @@ func TestStreamScannerHandler_PingInterleavesWithSlowUpstream(t *testing.T) {
setting.PingIntervalSeconds = oldSeconds
})
// Slow upstream + slow handler. Total stream takes ~5 seconds.
// The ping goroutine stays alive as long as the scanner is reading,
// so pings should fire between data writes.
pr, pw := io.Pipe()
go func() {
defer pw.Close()
@@ -498,9 +668,8 @@ func TestStreamScannerHandler_PingInterleavesWithSlowUpstream(t *testing.T) {
var count atomic.Int64
done := make(chan struct{})
go func() {
StreamScannerHandler(c, resp, info, func(data string) bool {
StreamScannerHandler(c, resp, info, func(data string, sr *StreamResult) {
count.Add(1)
return true
})
close(done)
}()