Skip to content

Commit 92ca505

Browse files
committed
tfscheduler: add parameter for stale TF duration (consul)
1 parent d0c1d14 commit 92ca505

3 files changed

Lines changed: 32 additions & 15 deletions

File tree

script/README.md

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -62,5 +62,8 @@ Make sure that only a single instance of the proxy is started.
6262
## Consul parameters (tuning option)
6363

6464
### TfScheduler
65-
*MaxNumTfInBuilding* (`epn/data-dist/parameters/TfScheduler/MaxNumTfInBuilding`)
66-
Override the number of TFs each TfBuilder is allowed to build concurrently. This variable can be increased if a small number of EPNs is used.
65+
*MaxNumTfInBuilding* (`epn/data-dist/parameters/TfScheduler/MaxNumTfInBuilding`)
66+
Override the number of TFs each TfBuilder is allowed to build concurrently. This variable can be increased if a small number of EPNs is used.
67+
68+
*StaleStfTimeoutMs* (`epn/data-dist/parameters/TfScheduler/StaleStfTimeoutMs`)
69+
Timeout (in milliseconds) after which incomplete TFs are scheduled for building or deletion. Default is 5000 (5 s); the configured value is clamped to the range 500–30000.

src/TfScheduler/TfSchedulerStfInfo.cxx

Lines changed: 25 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -154,10 +154,16 @@ void TfSchedulerStfInfo::StaleCleanupThread()
154154

155155
std::vector<StfInfo> lStfInfos;
156156

157+
auto lStaleStfTimeoutMs = mDiscoveryConfig->getUInt64Param(cStaleStfTimeoutMsKey, 5000); // 5 seconds
158+
157159
while (mRunning) {
158-
std::this_thread::sleep_for(std::chrono::seconds(sStfDiscardTimeout));
160+
std::this_thread::sleep_for(std::chrono::milliseconds(250));
159161
lLastDiscardTime = std::chrono::steady_clock::now();
160162

163+
// update housekeeping parameters
164+
lStaleStfTimeoutMs = std::clamp(mDiscoveryConfig->getUInt64Param(cStaleStfTimeoutMsKey, 5000), std::uint64_t(500), std::uint64_t(30000));
165+
DDDLOG_RL(10000, "Dropping stale STFs parameter (consul): StaleStfCount={}", lStaleStfTimeoutMs);
166+
161167
lStfsToErase.clear();
162168
{
163169
std::unique_lock lLock(mGlobalStfInfoLock);
@@ -175,29 +181,30 @@ void TfSchedulerStfInfo::StaleCleanupThread()
175181
continue;
176182
}
177183

178-
// check reap
184+
// check and reap
179185
const auto &lLastStfInfo = lStfInfoVec.back();
180186
const auto lTimeDiff = std::chrono::abs(lLastStfInfo.mUpdateLocalTime - lLastDiscardTime);
181-
if (lTimeDiff > sStfDiscardTimeout) {
182-
WDDLOG_RL(1000, "Discarding incomplete SubTimeFrame. stf_id={} received={} expected={}",
183-
lStfId, lStfInfoVec.size(), lNumStfSenders);
187+
188+
if (lTimeDiff > std::chrono::milliseconds(lStaleStfTimeoutMs)) {
189+
EDDLOG_RL(1000, "Discarding incomplete TimeFrame. stf_id={} received={} expected={} lastUpdateMs={}",
190+
lStfId, lStfInfoVec.size(), lNumStfSenders, std::chrono::duration_cast<std::chrono::milliseconds>(lTimeDiff).count());
191+
192+
// erase the stale stf
193+
lStfsToErase.push_back(lStfId);
184194

185195
// find missing StfSenders
186196
std::set<std::string> lMissingStfSenders = lStfSenderIdSet;
187-
188197
for (const auto &lUpdate : lStfInfoVec) {
189198
lMissingStfSenders.erase(lUpdate.process_id());
190199
}
191200

192-
std::string lMissingIds = boost::algorithm::join(lMissingStfSenders, ", ");
193-
DDDLOG("Missing STFs from StfSender IDs: {}", lMissingIds);
194-
EDDLOG_RL(5000, "Missing STFs from StfSender IDs: {}", lMissingIds);
195-
196201
for (const auto &lStf : lMissingStfSenders) {
197202
lStfSenderMissingCnt[lStf]++;
198203
}
199204

200-
lStfsToErase.push_back(lStfId);
205+
DDDLOG_RL(1000, "Missing STFs from StfSender IDs: {}", boost::algorithm::join(lMissingStfSenders, ", "));
206+
} else {
207+
break; // stop checking as soon the early STFs are within the stale timeout
201208
}
202209
}
203210

@@ -208,8 +215,13 @@ void TfSchedulerStfInfo::StaleCleanupThread()
208215
}
209216

210217
if (lStfsToErase.size() > 0) {
211-
WDDLOG("SchedulingThread: TFs have been discarded due to incomplete number of STFs. discarded_tf_count={}",
212-
lStfsToErase.size());
218+
static std::uint64_t mStaleTfCount = 0;
219+
mStaleTfCount += lStfsToErase.size();
220+
221+
WDDLOG_RL(2000, "StaleStfDropThread: TFs have been discarded due to incomplete number of STFs. discarded_tf_count={} total={}",
222+
lStfsToErase.size(), mStaleTfCount);
223+
DDMON("tfscheduler", "tf.rejected.stale_not_completed_stf", mStaleTfCount);
224+
DDMON("tfscheduler", "tf.rejected.total", mNotScheduledTfsCount);
213225

214226
for (const auto &lStfSenderCnt : lStfSenderMissingCnt) {
215227
if (lStfSenderCnt.second > 0) {

src/TfScheduler/TfSchedulerStfInfo.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,8 @@ struct TopoStfInfo {
7676

7777
class TfSchedulerStfInfo
7878
{
79+
static constexpr std::string_view cStaleStfTimeoutMsKey = "StaleStfTimeoutMs";
80+
7981
public:
8082
TfSchedulerStfInfo() = delete;
8183
TfSchedulerStfInfo(std::shared_ptr<ConsulTfSchedulerInstance> pDiscoveryConfig,

0 commit comments

Comments
 (0)