@@ -154,10 +154,16 @@ void TfSchedulerStfInfo::StaleCleanupThread()
154154
155155 std::vector<StfInfo> lStfInfos;
156156
157+ auto lStaleStfTimeoutMs = mDiscoveryConfig ->getUInt64Param (cStaleStfTimeoutMsKey, 5000 ); // 5 seconds
158+
157159 while (mRunning ) {
158- std::this_thread::sleep_for (std::chrono::seconds ( sStfDiscardTimeout ));
160+ std::this_thread::sleep_for (std::chrono::milliseconds ( 250 ));
159161 lLastDiscardTime = std::chrono::steady_clock::now ();
160162
163+ // update housekeeping parameters
164+ lStaleStfTimeoutMs = std::clamp (mDiscoveryConfig ->getUInt64Param (cStaleStfTimeoutMsKey, 5000 ), std::uint64_t (500 ), std::uint64_t (30000 ));
165+ DDDLOG_RL (10000 , " Dropping stale STFs parameter (consul): StaleStfCount={}" , lStaleStfTimeoutMs);
166+
161167 lStfsToErase.clear ();
162168 {
163169 std::unique_lock lLock (mGlobalStfInfoLock );
@@ -175,29 +181,30 @@ void TfSchedulerStfInfo::StaleCleanupThread()
175181 continue ;
176182 }
177183
178- // check reap
184+ // check and reap
179185 const auto &lLastStfInfo = lStfInfoVec.back ();
180186 const auto lTimeDiff = std::chrono::abs (lLastStfInfo.mUpdateLocalTime - lLastDiscardTime);
181- if (lTimeDiff > sStfDiscardTimeout ) {
182- WDDLOG_RL (1000 , " Discarding incomplete SubTimeFrame. stf_id={} received={} expected={}" ,
183- lStfId, lStfInfoVec.size (), lNumStfSenders);
187+
188+ if (lTimeDiff > std::chrono::milliseconds (lStaleStfTimeoutMs)) {
189+ EDDLOG_RL (1000 , " Discarding incomplete TimeFrame. stf_id={} received={} expected={} lastUpdateMs={}" ,
190+ lStfId, lStfInfoVec.size (), lNumStfSenders, std::chrono::duration_cast<std::chrono::milliseconds>(lTimeDiff).count ());
191+
192+ // erase the stale stf
193+ lStfsToErase.push_back (lStfId);
184194
185195 // find missing StfSenders
186196 std::set<std::string> lMissingStfSenders = lStfSenderIdSet;
187-
188197 for (const auto &lUpdate : lStfInfoVec) {
189198 lMissingStfSenders.erase (lUpdate.process_id ());
190199 }
191200
192- std::string lMissingIds = boost::algorithm::join (lMissingStfSenders, " , " );
193- DDDLOG (" Missing STFs from StfSender IDs: {}" , lMissingIds);
194- EDDLOG_RL (5000 , " Missing STFs from StfSender IDs: {}" , lMissingIds);
195-
196201 for (const auto &lStf : lMissingStfSenders) {
197202 lStfSenderMissingCnt[lStf]++;
198203 }
199204
200- lStfsToErase.push_back (lStfId);
205+ DDDLOG_RL (1000 , " Missing STFs from StfSender IDs: {}" , boost::algorithm::join (lMissingStfSenders, " , " ));
206+ } else {
206+ break ; // stop checking as soon as the earliest STFs are within the stale timeout
201208 }
202209 }
203210
@@ -208,8 +215,13 @@ void TfSchedulerStfInfo::StaleCleanupThread()
208215 }
209216
210217 if (lStfsToErase.size () > 0 ) {
211- WDDLOG (" SchedulingThread: TFs have been discarded due to incomplete number of STFs. discarded_tf_count={}" ,
212- lStfsToErase.size ());
218+ static std::uint64_t mStaleTfCount = 0 ;
219+ mStaleTfCount += lStfsToErase.size ();
220+
221+ WDDLOG_RL (2000 , " StaleStfDropThread: TFs have been discarded due to incomplete number of STFs. discarded_tf_count={} total={}" ,
222+ lStfsToErase.size (), mStaleTfCount );
223+ DDMON (" tfscheduler" , " tf.rejected.stale_not_completed_stf" , mStaleTfCount );
224+ DDMON (" tfscheduler" , " tf.rejected.total" , mNotScheduledTfsCount );
213225
214226 for (const auto &lStfSenderCnt : lStfSenderMissingCnt) {
215227 if (lStfSenderCnt.second > 0 ) {
0 commit comments