@@ -60,7 +60,11 @@ type requestCache struct {
6060 regionReqs map [SubscriptionID ]map [uint64 ]regionReq
6161 }
6262
63- // counter for sent but not initialized requests
63+ // pendingCount is a flow control slot counter.
64+ // A slot is acquired when a request is successfully enqueued into pendingQueue (see add),
65+ // and is released when the request is finished/removed (resolve/markStopped/markDone/clear).
66+ // pop doesn't change it, and markSent normally doesn't either. The one exception: if markSent overwrites
67+ // an existing request for the same region, it releases the slot held by the replaced request to avoid leaking pendingCount.
6468 pendingCount atomic.Int64
6569 // maximum number of pending requests allowed
6670 maxPendingCount int64
@@ -104,12 +108,10 @@ func (c *requestCache) add(ctx context.Context, region regionInfo, force bool) (
104108 case <- ctx .Done ():
105109 return false , ctx .Err ()
106110 case c .pendingQueue <- req :
107- c .incPendingCount ()
111+ c .pendingCount . Inc ()
108112 cost := time .Since (start )
109113 metrics .SubscriptionClientAddRegionRequestDuration .Observe (cost .Seconds ())
110114 return true , nil
111- case <- c .spaceAvailable :
112- continue
113115 case <- ticker .C :
114116 addReqRetryLimit --
115117 if addReqRetryLimit <= 0 {
@@ -135,7 +137,9 @@ func (c *requestCache) add(ctx context.Context, region regionInfo, force bool) (
135137 }
136138}
137139
138- // pop gets the next pending request, returns nil if queue is empty
140+ // pop gets the next pending request.
141+ // Note: it doesn't change pendingCount. The slot acquired in add() should be released later
142+ // (e.g. resolve/markStopped/markDone).
139143func (c * requestCache ) pop (ctx context.Context ) (regionReq , error ) {
140144 select {
141145 case req := <- c .pendingQueue :
@@ -145,7 +149,8 @@ func (c *requestCache) pop(ctx context.Context) (regionReq, error) {
145149 }
146150}
147151
148- // markSent marks a request as sent and adds it to sent requests
152+ // markSent marks a request as sent and adds it to sent requests.
153+ // It doesn't change pendingCount unless it overwrites an existing request for the same region, in which case the replaced request's slot is released.
149154func (c * requestCache ) markSent (req regionReq ) {
150155 c .sentRequests .Lock ()
151156 defer c .sentRequests .Unlock ()
@@ -157,10 +162,20 @@ func (c *requestCache) markSent(req regionReq) {
157162 c .sentRequests .regionReqs [req .regionInfo .subscribedSpan .subID ] = m
158163 }
159164
165+ if oldReq , exists := m [req .regionInfo .verID .GetID ()]; exists {
166+ log .Warn ("region request overwritten" ,
167+ zap .Uint64 ("subID" , uint64 (req .regionInfo .subscribedSpan .subID )),
168+ zap .Uint64 ("regionID" , req .regionInfo .verID .GetID ()),
169+ zap .Float64 ("oldAgeSec" , time .Since (oldReq .createTime ).Seconds ()),
170+ zap .Float64 ("newAgeSec" , time .Since (req .createTime ).Seconds ()),
171+ zap .Int ("pendingCount" , int (c .pendingCount .Load ())),
172+ zap .Int ("pendingQueueLen" , len (c .pendingQueue )))
173+ c .markDone ()
174+ }
160175 m [req .regionInfo .verID .GetID ()] = req
161176}
162177
163- // markStopped removes a sent request without changing pending count (for stopped regions)
178+ // markStopped removes a sent request and releases a slot.
164179func (c * requestCache ) markStopped (subID SubscriptionID , regionID uint64 ) {
165180 c .sentRequests .Lock ()
166181 defer c .sentRequests .Unlock ()
@@ -176,12 +191,7 @@ func (c *requestCache) markStopped(subID SubscriptionID, regionID uint64) {
176191 }
177192
178193 delete (regionReqs , regionID )
179- c .decPendingCount ()
180- // Notify waiting add operations that there's space available
181- select {
182- case c .spaceAvailable <- struct {}{}:
183- default : // If channel is full, skip notification
184- }
194+ c .markDone ()
185195}
186196
187197// resolve marks a region as initialized and removes it from sent requests
@@ -201,19 +211,14 @@ func (c *requestCache) resolve(subscriptionID SubscriptionID, regionID uint64) b
201211 // Check if the subscription ID matches
202212 if req .regionInfo .subscribedSpan .subID == subscriptionID {
203213 delete (regionReqs , regionID )
204- c .decPendingCount ()
214+ c .markDone ()
205215 cost := time .Since (req .createTime ).Seconds ()
206216 if cost > 0 && cost < abnormalRequestDurationInSec {
207217 log .Debug ("cdc resolve region request" , zap .Uint64 ("subID" , uint64 (subscriptionID )), zap .Uint64 ("regionID" , regionID ), zap .Float64 ("cost" , cost ), zap .Int ("pendingCount" , int (c .pendingCount .Load ())), zap .Int ("pendingQueueLen" , len (c .pendingQueue )))
208218 metrics .RegionRequestFinishScanDuration .Observe (cost )
209219 } else {
210220 log .Info ("region request duration abnormal, skip metric" , zap .Float64 ("cost" , cost ), zap .Uint64 ("regionID" , regionID ))
211221 }
212- // Notify waiting add operations that there's space available
213- select {
214- case c .spaceAvailable <- struct {}{}:
215- default : // If channel is full, skip notification
216- }
217222 return true
218223 }
219224
@@ -235,8 +240,8 @@ func (c *requestCache) clearStaleRequest() {
235240 regionReq .regionInfo .subscribedSpan .stopped .Load () ||
236241 regionReq .regionInfo .lockedRangeState .Initialized .Load () ||
237242 regionReq .isStale () {
238- c .decPendingCount ()
239- log .Info ("region worker delete stale region request" ,
243+ c .markDone ()
244+ log .Warn ("region worker delete stale region request" ,
240245 zap .Uint64 ("subID" , uint64 (subID )),
241246 zap .Uint64 ("regionID" , regionID ),
242247 zap .Int ("pendingCount" , int (c .pendingCount .Load ())),
@@ -247,17 +252,27 @@ func (c *requestCache) clearStaleRequest() {
247252 zap .Time ("createTime" , regionReq .createTime ))
248253 delete (regionReqs , regionID )
249254 } else {
250- reqCount += 1
255+ reqCount ++
251256 }
252257 }
253258 if len (regionReqs ) == 0 {
254259 delete (c .sentRequests .regionReqs , subID )
255260 }
256261 }
257262
258- if reqCount == 0 && c .pendingCount .Load () != 0 {
259- log .Info ("region worker pending request count is not equal to actual region request count, correct it" , zap .Int ("pendingCount" , int (c .pendingCount .Load ())), zap .Int ("actualReqCount" , reqCount ))
263+ // If no sent requests remain in the cache and pendingQueue is empty but pendingCount isn't 0, it means pendingCount is stale.
264+ // Reset it to avoid blocking add() forever.
265+ if reqCount == 0 && len (c .pendingQueue ) == 0 && c .pendingCount .Load () != 0 {
266+ log .Info ("region worker pending request count is not equal to actual region request count, correct it" ,
267+ zap .Int ("pendingCount" , int (c .pendingCount .Load ())),
268+ zap .Int ("actualReqCount" , reqCount ),
269+ zap .Int ("pendingQueueLen" , len (c .pendingQueue )))
260270 c .pendingCount .Store (0 )
271+ // Notify waiting add operations that there's space available.
272+ select {
273+ case c .spaceAvailable <- struct {}{}:
274+ default :
275+ }
261276 }
262277
263278 c .lastCheckStaleRequestTime .Store (time .Now ())
@@ -273,7 +288,7 @@ LOOP:
273288 select {
274289 case req := <- c .pendingQueue :
275290 regions = append (regions , req .regionInfo )
276- c .decPendingCount ()
291+ c .markDone ()
277292 default :
278293 break LOOP
279294 }
@@ -286,7 +301,7 @@ LOOP:
286301 for regionID := range regionReqs {
287302 regions = append (regions , regionReqs [regionID ].regionInfo )
288303 delete (regionReqs , regionID )
289- c .decPendingCount ()
304+ c .markDone ()
290305 }
291306 delete (c .sentRequests .regionReqs , subID )
292307 }
@@ -298,17 +313,26 @@ func (c *requestCache) getPendingCount() int {
298313 return int (c .pendingCount .Load ())
299314}
300315
301- func (c * requestCache ) incPendingCount () {
302- c .pendingCount .Inc ()
303- }
304-
305- func (c * requestCache ) decPendingCount () {
306- // Ensure pendingCount doesn't go below 0
307- current := c .pendingCount .Load ()
308- newCount := current - int64 (1 )
309- if newCount < 0 {
310- c .pendingCount .Store (0 )
311- return
316+ func (c * requestCache ) markDone () {
317+ // Decrement pendingCount by 1, but never let it go below 0.
318+ // Do it with CAS to avoid clobbering concurrent Inc() calls.
319+ for {
320+ old := c .pendingCount .Load ()
321+ if old == 0 {
322+ break
323+ } else if old < 0 {
324+ if c .pendingCount .CompareAndSwap (old , 0 ) {
325+ break
326+ }
327+ } else {
328+ if c .pendingCount .CompareAndSwap (old , old - 1 ) {
329+ break
330+ }
331+ }
332+ }
333+ // Notify waiting add operations that there's space available.
334+ select {
335+ case c .spaceAvailable <- struct {}{}:
336+ default : // If channel is full, skip notification
312337 }
313- c .pendingCount .Dec ()
314338}
0 commit comments