From 9cf9f58dc00134cb249d4b79fc0f27af2b002bb3 Mon Sep 17 00:00:00 2001
From: Petri-Johan Last
Date: Wed, 29 Mar 2023 10:11:36 +0200
Subject: [PATCH] Docs and comment updates explaining GitHub rate limit
 decisions (#50088)

---
 .../background-information/github-api-oddities.md |  6 ++++++
 internal/extsvc/github/v3.go                      | 14 +++++++++++++-
 internal/extsvc/github/v4.go                      | 13 +++++++++++++
 3 files changed, 32 insertions(+), 1 deletion(-)

diff --git a/doc/dev/background-information/github-api-oddities.md b/doc/dev/background-information/github-api-oddities.md
index d1707f15196..1ea51a72495 100644
--- a/doc/dev/background-information/github-api-oddities.md
+++ b/doc/dev/background-information/github-api-oddities.md
@@ -29,3 +29,9 @@ None of these options are consistent, and all of them can change while paginatin
 What this effectively means is that, when syncing a large number of repositories (the more pages, the worse it gets), successive syncs will find repositories it previously missed, and miss repositories it previously found, even though those repositories should all still be part of the search.
 
 As of yet, there does not seem to be a way around this that is not extremely inefficient.
+
+## Rate limits
+
+Unlike GitLab, Azure DevOps, or Bitbucket Cloud, GitHub rejects rate-limited requests with a 403 Forbidden instead of a 429 Too Many Requests. We therefore have to depend on the rate limit response headers to determine whether a request should be retried.
+
+However, in extremely unlikely scenarios we can receive rate limit response headers but only check them once they are already outdated. Outdated headers cannot be trusted (they may be left over from an old request and simply never updated), so in that case we do not retry the request, even though a retry would have succeeded.
diff --git a/internal/extsvc/github/v3.go b/internal/extsvc/github/v3.go
index 84b3b7793c8..6bf98e4e862 100644
--- a/internal/extsvc/github/v3.go
+++ b/internal/extsvc/github/v3.go
@@ -241,7 +241,19 @@ func (c *V3Client) request(ctx context.Context, req *http.Request, result any) (
 	// 3. We succeed
 	for c.waitForRateLimit && err != nil && numRetries < c.maxRateLimitRetries &&
 		errors.As(err, &apiError) && apiError.Code == http.StatusForbidden {
-		// If we end up waiting because of an external rate limit, we need to retry the request.
+		// Because GitHub responds with http.StatusForbidden when a rate limit is hit, we cannot
+		// say with absolute certainty that a rate limit was hit; it might have been an honest
+		// http.StatusForbidden. So we use the externalRateLimiter's WaitForRateLimit function
+		// to calculate the amount of time we need to wait before retrying the request.
+		// If that calculated time is zero or in the past, we have to assume that the
+		// rate limiting information we have is old and no longer relevant.
+		//
+		// There is an extremely unlikely edge case where we will falsely not retry a request.
+		// If a request is rejected because we have no more rate limit tokens, but the token reset
+		// time is just around the corner (say, 1 second from now), and for some reason the time
+		// between reading the headers and doing this "should we retry" check is greater than
+		// that time, the rate limit information we have will look outdated and we won't retry
+		// the request.
 		if c.externalRateLimiter.WaitForRateLimit(ctx, 1) {
 			req.Body = io.NopCloser(bytes.NewBuffer(reqBody))
 			resp, err = doRequest(ctx, c.log, c.apiURL, c.auth, c.externalRateLimiter, c.httpClient, req, result)
diff --git a/internal/extsvc/github/v4.go b/internal/extsvc/github/v4.go
index d2662b9a9cd..c2d24bcffc2 100644
--- a/internal/extsvc/github/v4.go
+++ b/internal/extsvc/github/v4.go
@@ -175,6 +175,19 @@ func (c *V4Client) requestGraphQL(ctx context.Context, query string, vars map[st
 	for c.waitForRateLimit && err != nil && numRetries < c.maxRateLimitRetries &&
 		errors.As(err, &apiError) && apiError.Code == http.StatusForbidden {
 		req.Body = io.NopCloser(bytes.NewBuffer(reqBody))
+		// Because GitHub responds with http.StatusForbidden when a rate limit is hit, we cannot
+		// say with absolute certainty that a rate limit was hit; it might have been an honest
+		// http.StatusForbidden. So we use the externalRateLimiter's WaitForRateLimit function
+		// to calculate the amount of time we need to wait before retrying the request.
+		// If that calculated time is zero or in the past, we have to assume that the
+		// rate limiting information we have is old and no longer relevant.
+		//
+		// There is an extremely unlikely edge case where we will falsely not retry a request.
+		// If a request is rejected because we have no more rate limit tokens, but the token reset
+		// time is just around the corner (say, 1 second from now), and for some reason the time
+		// between reading the headers and doing this "should we retry" check is greater than
+		// that time, the rate limit information we have will look outdated and we won't retry
+		// the request.
 		if c.externalRateLimiter.WaitForRateLimit(ctx, cost) {
 			_, err = doRequest(ctx, c.log, c.apiURL, c.auth, c.externalRateLimiter, c.httpClient, req, &respBody)
 			numRetries++
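
For readers outside the codebase, the retry decision this patch documents can be sketched in isolation. The following is a minimal, self-contained illustration, not the actual Sourcegraph implementation: `rateLimitInfo`, `parseRateLimitHeaders`, and the standalone `waitForRateLimit` are hypothetical names invented for this sketch, and the header handling is simplified. It shows how a 403 plus GitHub's `x-ratelimit-remaining` and `x-ratelimit-reset` headers can be turned into a wait-and-retry decision, including the stale-header edge case described in the comments above.

```go
package main

import (
	"context"
	"fmt"
	"net/http"
	"strconv"
	"time"
)

// rateLimitInfo holds what GitHub's rate limit headers told us.
type rateLimitInfo struct {
	remaining int       // x-ratelimit-remaining
	reset     time.Time // x-ratelimit-reset (Unix seconds)
	known     bool      // whether we saw usable headers at all
}

// parseRateLimitHeaders extracts rate limit information from response headers.
func parseRateLimitHeaders(h http.Header) rateLimitInfo {
	rem, err1 := strconv.Atoi(h.Get("x-ratelimit-remaining"))
	resetSec, err2 := strconv.ParseInt(h.Get("x-ratelimit-reset"), 10, 64)
	if err1 != nil || err2 != nil {
		return rateLimitInfo{} // headers missing or malformed: nothing to go on
	}
	return rateLimitInfo{remaining: rem, reset: time.Unix(resetSec, 0), known: true}
}

// waitForRateLimit reports whether the caller should retry a request that was
// rejected with 403 Forbidden. It returns true only if the headers say we are
// out of tokens AND the reset time is still in the future. A reset time at or
// before "now" is indistinguishable from stale leftover information, so we do
// not retry: this is the unlikely edge case the patch documents.
func waitForRateLimit(ctx context.Context, info rateLimitInfo) bool {
	if !info.known || info.remaining > 0 {
		// No rate limit info, or tokens remain: the 403 was probably an
		// honest Forbidden rather than a rate limit rejection.
		return false
	}
	wait := time.Until(info.reset)
	if wait <= 0 {
		// Zero or negative wait: the headers look outdated and cannot be
		// trusted, so we decline to retry.
		return false
	}
	select {
	case <-time.After(wait):
		return true // waited out the window; caller should retry the request
	case <-ctx.Done():
		return false // caller gave up before the window reset
	}
}

func main() {
	// Simulate a 403 whose headers say the limit resets two seconds from now.
	h := http.Header{}
	h.Set("x-ratelimit-remaining", "0")
	h.Set("x-ratelimit-reset", strconv.FormatInt(time.Now().Add(2*time.Second).Unix(), 10))

	if waitForRateLimit(context.Background(), parseRateLimitHeaders(h)) {
		fmt.Println("rate limit window passed; retrying request")
	} else {
		fmt.Println("not a trustworthy rate limit rejection; giving up")
	}
}
```

The `wait <= 0` branch is the trade-off both comment blocks describe: treating a zero-or-past reset time as stale means a request rejected one instant before its reset may falsely skip its retry, but trusting such headers would risk retrying on leftover data from an old response.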