Skip to content

Commit 54b4a69

Browse files
tarunramsinghaniazure-pipelines-bot
andauthored
AgentServer connections ignore VSTS_HTTP_TIMEOUT environment variable and use hardcoded timeout values (#5432)
* Fix: Respect VSTS_HTTP_TIMEOUT environment variable in AgentServer The agent listener was ignoring the VSTS_HTTP_TIMEOUT environment variable and using hardcoded timeout values (60s for message/request connections, 100s for generic connection) when establishing server connections. This caused timeout errors even when users set VSTS_HTTP_TIMEOUT to higher values like 300 seconds, making the environment variable non-functional. Changes: - Modified AgentServer.ConnectAsync() to read timeout from AgentKnobs.HttpTimeout - Applied valid range constraint [100, 1200] seconds as documented - All three connections now use the same configurable timeout - Worker connections already respect this variable via VssUtil.CreateConnection() Fixes issue where agents in slow network environments experienced frequent timeouts despite setting VSTS_HTTP_TIMEOUT environment variable. Related logs showing the issue: [INFO EnvironmentCapabilitiesProvider] Adding 'VSTS_HTTP_TIMEOUT': '300' [INFO AgentServer] Establish connection with 60 seconds timeout. [ERR MessageListener] System.TimeoutException: HTTP request timed out after 00:01:00 * Fix: Respect VSTS_HTTP_TIMEOUT environment variable for agent connections - Add GetDefaultConnectionTimeout() helper to centralize timeout logic - Update ConnectAsync to use environment variable timeout - Update RefreshConnectionAsync to accept optional timeout (defaults to env var) - Update ResetConnectionTimeout to accept optional timeout (defaults to env var) - Rename SetConnectionTimeout to ResetConnectionTimeout for clarity - Keep 30s timeout for error recovery scenarios (intentional fast failure) - Update JobDispatcherL0 tests to handle nullable TimeSpan parameters Fixes agent server ignoring VSTS_HTTP_TIMEOUT setting, causing unexpected 60-second timeouts instead of configured values (100-1200 second range). * reverting to 60 sec for JobRequest connection type * fixing override --------- Co-authored-by: azure-pipelines-bot <[email protected]>
1 parent 05a617e commit 54b4a69

File tree

4 files changed

+73
-24
lines changed

4 files changed

+73
-24
lines changed

src/Agent.Listener/JobDispatcher.cs

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -401,10 +401,10 @@ private async Task RunAsync(Pipelines.AgentJobRequestMessage message, WorkerDisp
401401
// Because an agent can be idle for a long time between jobs, it is possible that in that time
402402
// a firewall has closed the connection. For that reason, forcibly reestablish this connection at the
403403
// start of a new job
404-
Trace.Info(StringUtil.Format("Refreshing server connection before job execution [ConnectionType:JobRequest, Timeout:30s, JobId:{0}]",
404+
Trace.Info(StringUtil.Format("Refreshing server connection before job execution [ConnectionType:JobRequest, JobId:{0}]",
405405
message.JobId));
406406
var agentServer = HostContext.GetService<IAgentServer>();
407-
await agentServer.RefreshConnectionAsync(AgentConnectionType.JobRequest, TimeSpan.FromSeconds(30));
407+
await agentServer.RefreshConnectionAsync(AgentConnectionType.JobRequest);
408408

409409
// start renew job request
410410
Trace.Info($"Start renew job request {requestId} for job {message.JobId}.");
@@ -816,9 +816,9 @@ public async Task RenewJobRequestAsync(int poolId, long requestId, Guid lockToke
816816
if (encounteringError > 0)
817817
{
818818
encounteringError = 0;
819-
agentServer.SetConnectionTimeout(AgentConnectionType.JobRequest, TimeSpan.FromSeconds(60));
819+
agentServer.ResetConnectionTimeout(AgentConnectionType.JobRequest);
820820
HostContext.WritePerfCounter("JobRenewRecovered");
821-
Trace.Info(StringUtil.Format("Job renewal error recovery completed [RequestId:{0}, ErrorsCleared:True, ConnectionTimeout:60s, Status:Recovered]",
821+
Trace.Info(StringUtil.Format("Job renewal error recovery completed [RequestId:{0}, ErrorsCleared:True, Status:Recovered]",
822822
requestId));
823823
}
824824

@@ -888,7 +888,7 @@ public async Task RenewJobRequestAsync(int poolId, long requestId, Guid lockToke
888888
}
889889

890890
// Re-establish connection to server in order to avoid affinity with server.
891-
// Reduce connection timeout to 30 seconds (from 60s)
891+
// Reduce connection timeout to 30 seconds for faster failure detection during retries
892892
HostContext.WritePerfCounter("ResetJobRenewConnection");
893893
Trace.Info(StringUtil.Format("Job renewal connection refresh initiated [RequestId:{0}, Timeout:30s, Reason:RetryRecovery, ErrorCount:{1}]",
894894
requestId, encounteringError));

src/Agent.Listener/MessageListener.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -244,7 +244,7 @@ public async Task<TaskAgentMessage> GetNextMessageAsync(CancellationToken token)
244244
}
245245

246246
// re-create VssConnection before next retry
247-
await _agentServer.RefreshConnectionAsync(AgentConnectionType.MessageQueue, TimeSpan.FromSeconds(60));
247+
await _agentServer.RefreshConnectionAsync(AgentConnectionType.MessageQueue);
248248

249249
Trace.Info(StringUtil.Format("Sleeping for {0} seconds before retrying.", _getNextMessageRetryInterval.TotalSeconds));
250250
await HostContext.Delay(_getNextMessageRetryInterval, token);

src/Microsoft.VisualStudio.Services.Agent/AgentServer.cs

Lines changed: 63 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
using Microsoft.VisualStudio.Services.WebApi;
1212
using Microsoft.VisualStudio.Services.Common;
1313
using Agent.Sdk.Util;
14+
using Agent.Sdk.Knob;
1415

1516
namespace Microsoft.VisualStudio.Services.Agent
1617
{
@@ -26,9 +27,9 @@ public interface IAgentServer : IAgentService
2627
{
2728
Task ConnectAsync(Uri serverUrl, VssCredentials credentials);
2829

29-
Task RefreshConnectionAsync(AgentConnectionType connectionType, TimeSpan timeout);
30+
Task RefreshConnectionAsync(AgentConnectionType connectionType, TimeSpan? timeout = null);
3031

31-
void SetConnectionTimeout(AgentConnectionType connectionType, TimeSpan timeout);
32+
void ResetConnectionTimeout(AgentConnectionType connectionType, TimeSpan? timeout = null);
3233

3334
// Configuration
3435
Task<TaskAgent> AddAgentAsync(Int32 agentPoolId, TaskAgent agent);
@@ -73,12 +74,22 @@ public async Task ConnectAsync(Uri serverUrl, VssCredentials credentials)
7374

7475
// Establish the first connection before doing the rest in parallel to eliminate the redundant 401s.
7576
// issue: https://github.com/microsoft/azure-pipelines-agent/issues/3149
76-
Task<VssConnection> task1 = EstablishVssConnection(serverUrl, credentials, TimeSpan.FromSeconds(100));
77+
78+
// Read timeout from environment variable (VSTS_HTTP_TIMEOUT), default to 100 seconds
79+
TimeSpan connectionTimeout = GetConnectionTimeout(AgentConnectionType.Generic, null);
80+
Task<VssConnection> task1 = EstablishVssConnection(serverUrl, credentials, connectionTimeout);
7781

7882
_genericConnection = await task1;
7983

80-
Task<VssConnection> task2 = EstablishVssConnection(serverUrl, credentials, TimeSpan.FromSeconds(60));
81-
Task<VssConnection> task3 = EstablishVssConnection(serverUrl, credentials, TimeSpan.FromSeconds(60));
84+
// MessageQueue connection for long-polling - uses full timeout from environment variable
85+
TimeSpan messageQueueTimeout = GetConnectionTimeout(AgentConnectionType.MessageQueue, null);
86+
Task<VssConnection> task2 = EstablishVssConnection(serverUrl, credentials, messageQueueTimeout);
87+
88+
// JobRequest connection uses capped timeout (max 100s) to ensure job lock renewals
89+
// happen within the ~5 minute lock window. With 60s between renewals, timeout must be
90+
// < 240s to avoid lock expiration, so we cap at 100s for safety.
91+
TimeSpan jobRequestTimeout = GetConnectionTimeout(AgentConnectionType.JobRequest, null);
92+
Task<VssConnection> task3 = EstablishVssConnection(serverUrl, credentials, jobRequestTimeout);
8293

8394
await Task.WhenAll(task2, task3);
8495

@@ -95,8 +106,10 @@ public async Task ConnectAsync(Uri serverUrl, VssCredentials credentials)
95106
}
96107

97108
// Refresh connection is best effort. it should never throw exception
98-
public async Task RefreshConnectionAsync(AgentConnectionType connectionType, TimeSpan timeout)
109+
public async Task RefreshConnectionAsync(AgentConnectionType connectionType, TimeSpan? timeout = null)
99110
{
111+
TimeSpan actualTimeout = GetConnectionTimeout(connectionType, timeout);
112+
100113
Trace.Info($"Refresh {connectionType} VssConnection to get on a different AFD node.");
101114
VssConnection newConnection = null;
102115
switch (connectionType)
@@ -105,7 +118,7 @@ public async Task RefreshConnectionAsync(AgentConnectionType connectionType, Tim
105118
try
106119
{
107120
_hasMessageConnection = false;
108-
newConnection = await EstablishVssConnection(_messageConnection.Uri, _messageConnection.Credentials, timeout);
121+
newConnection = await EstablishVssConnection(_messageConnection.Uri, _messageConnection.Credentials, actualTimeout);
109122
var client = newConnection.GetClient<TaskAgentHttpClient>();
110123
_messageConnection = newConnection;
111124
_messageTaskAgentClient = client;
@@ -130,7 +143,7 @@ public async Task RefreshConnectionAsync(AgentConnectionType connectionType, Tim
130143
try
131144
{
132145
_hasRequestConnection = false;
133-
newConnection = await EstablishVssConnection(_requestConnection.Uri, _requestConnection.Credentials, timeout);
146+
newConnection = await EstablishVssConnection(_requestConnection.Uri, _requestConnection.Credentials, actualTimeout);
134147
var client = newConnection.GetClient<TaskAgentHttpClient>();
135148
_requestConnection = newConnection;
136149
_requestTaskAgentClient = client;
@@ -155,7 +168,7 @@ public async Task RefreshConnectionAsync(AgentConnectionType connectionType, Tim
155168
try
156169
{
157170
_hasGenericConnection = false;
158-
newConnection = await EstablishVssConnection(_genericConnection.Uri, _genericConnection.Credentials, timeout);
171+
newConnection = await EstablishVssConnection(_genericConnection.Uri, _genericConnection.Credentials, actualTimeout);
159172
var client = newConnection.GetClient<TaskAgentHttpClient>();
160173
_genericConnection = newConnection;
161174
_genericTaskAgentClient = client;
@@ -182,19 +195,21 @@ public async Task RefreshConnectionAsync(AgentConnectionType connectionType, Tim
182195
}
183196
}
184197

185-
public void SetConnectionTimeout(AgentConnectionType connectionType, TimeSpan timeout)
198+
public void ResetConnectionTimeout(AgentConnectionType connectionType, TimeSpan? timeout = null)
186199
{
187-
Trace.Info($"Set {connectionType} VssConnection's timeout to {timeout.TotalSeconds} seconds.");
200+
TimeSpan actualTimeout = GetConnectionTimeout(connectionType, timeout);
201+
202+
Trace.Info($"Set {connectionType} VssConnection's timeout to {actualTimeout.TotalSeconds} seconds.");
188203
switch (connectionType)
189204
{
190205
case AgentConnectionType.JobRequest:
191-
_requestConnection.Settings.SendTimeout = timeout;
206+
_requestConnection.Settings.SendTimeout = actualTimeout;
192207
break;
193208
case AgentConnectionType.MessageQueue:
194-
_messageConnection.Settings.SendTimeout = timeout;
209+
_messageConnection.Settings.SendTimeout = actualTimeout;
195210
break;
196211
case AgentConnectionType.Generic:
197-
_genericConnection.Settings.SendTimeout = timeout;
212+
_genericConnection.Settings.SendTimeout = actualTimeout;
198213
break;
199214
default:
200215
Trace.Error($"Unexpected connection type: {connectionType}.");
@@ -229,6 +244,40 @@ private async Task<VssConnection> EstablishVssConnection(Uri serverUrl, VssCrede
229244
throw new InvalidOperationException(nameof(EstablishVssConnection));
230245
}
231246

247+
/// <summary>
248+
/// Gets the connection timeout for the specified connection type.
249+
/// Reads from VSTS_HTTP_TIMEOUT environment variable with valid range [100, 1200] seconds.
250+
/// JobRequest connections are capped at 60s to ensure job lock renewals complete within
251+
/// the ~5 minute lock window (60s timeout + 60s delay = 120s < 300s).
252+
/// </summary>
253+
/// <param name="connectionType">Type of connection</param>
254+
/// <param name="timeout">Optional explicit timeout (overrides environment variable but not type-specific caps)</param>
255+
/// <returns>Timeout value to use</returns>
256+
private TimeSpan GetConnectionTimeout(AgentConnectionType connectionType, TimeSpan? timeout)
257+
{
258+
TimeSpan actualTimeout;
259+
260+
if (timeout.HasValue)
261+
{
262+
actualTimeout = timeout.Value;
263+
}
264+
else
265+
{
266+
// Read from environment variable, clamped to valid range [100, 1200]
267+
int httpRequestTimeoutSeconds = AgentKnobs.HttpTimeout.GetValue(HostContext).AsInt();
268+
actualTimeout = TimeSpan.FromSeconds(Math.Min(Math.Max(httpRequestTimeoutSeconds, 100), 1200));
269+
}
270+
271+
// Cap JobRequest timeout to 60s to ensure renewals complete within job lock window
272+
// This applies to both explicit and environment variable timeouts
273+
if (connectionType == AgentConnectionType.JobRequest)
274+
{
275+
return TimeSpan.FromSeconds(Math.Min(actualTimeout.TotalSeconds, 60));
276+
}
277+
278+
return actualTimeout;
279+
}
280+
232281
private void CheckConnection(AgentConnectionType connectionType)
233282
{
234283
switch (connectionType)

src/Test/L0/Listener/JobDispatcherL0.cs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -330,8 +330,8 @@ public async void DispatcherRenewJobRequestRecoverFromExceptions()
330330
Assert.True(firstJobRequestRenewed.Task.IsCompletedSuccessfully, "First renew should succeed.");
331331
Assert.True(cancellationTokenSource.IsCancellationRequested);
332332
_agentServer.Verify(x => x.RenewAgentRequestAsync(It.IsAny<int>(), It.IsAny<long>(), It.IsAny<Guid>(), It.IsAny<CancellationToken>()), Times.Exactly(8));
333-
_agentServer.Verify(x => x.RefreshConnectionAsync(AgentConnectionType.JobRequest, It.IsAny<TimeSpan>()), Times.Exactly(3));
334-
_agentServer.Verify(x => x.SetConnectionTimeout(AgentConnectionType.JobRequest, It.IsAny<TimeSpan>()), Times.Once);
333+
_agentServer.Verify(x => x.RefreshConnectionAsync(AgentConnectionType.JobRequest, It.IsAny<TimeSpan?>()), Times.Exactly(3));
334+
_agentServer.Verify(x => x.ResetConnectionTimeout(AgentConnectionType.JobRequest, It.IsAny<TimeSpan?>()), Times.Once);
335335
}
336336
}
337337

@@ -453,8 +453,8 @@ public async void DispatcherRenewJobRequestStopOnExpiredRequest()
453453
Assert.True(firstJobRequestRenewed.Task.IsCompletedSuccessfully, "First renew should succeed.");
454454
Assert.False(cancellationTokenSource.IsCancellationRequested);
455455
_agentServer.Verify(x => x.RenewAgentRequestAsync(It.IsAny<int>(), It.IsAny<long>(), It.IsAny<Guid>(), It.IsAny<CancellationToken>()), Times.Exactly(5));
456-
_agentServer.Verify(x => x.RefreshConnectionAsync(AgentConnectionType.JobRequest, It.IsAny<TimeSpan>()), Times.Exactly(3));
457-
_agentServer.Verify(x => x.SetConnectionTimeout(AgentConnectionType.JobRequest, It.IsAny<TimeSpan>()), Times.Never);
456+
_agentServer.Verify(x => x.RefreshConnectionAsync(AgentConnectionType.JobRequest, It.IsAny<TimeSpan?>()), Times.Exactly(3));
457+
_agentServer.Verify(x => x.ResetConnectionTimeout(AgentConnectionType.JobRequest, It.IsAny<TimeSpan?>()), Times.Never);
458458
}
459459
}
460460

0 commit comments

Comments
 (0)