Skip to content

Commit

Permalink
Perform retries (on UnknownHostExceptions)
Browse files Browse the repository at this point in the history
If an UnknownHostException occurs (#270), we retry a configurable
number of times, as this is sometimes due to DNS (caching) errors
with Java (or the underlying OS).

Fix typos and wording (#343).
  • Loading branch information
ascheman committed Dec 14, 2024
1 parent 59b8370 commit 51f3532
Show file tree
Hide file tree
Showing 5 changed files with 53 additions and 37 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@ public class Configuration {
@Getter(AccessLevel.NONE)
@Builder.Default
Boolean ignoreIPAddresses = false;
@Builder.Default
Integer retries = 0;
/*
* Explanation for configuring http status codes:
* The standard http status codes are defined in class @link NetUtil and can
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@


/**
* Check html anchor href attributes
* Check HTML anchor href attributes
*
* @see <a href="https://www.w3schools.com/tags/att_a_href.asp">https://www.w3schools.com/tags/att_a_href.asp</a>
*/
Expand All @@ -32,15 +32,13 @@ class BrokenHttpLinksChecker extends Checker {
private final Set<Integer> successCodes;
private final Set<Integer> warningCodes;
private final Set<Integer> errorCodes;
// all href attributes with http(s) protocol,
// including potential duplicates
// need that to calculate "nrOfOccurrences"
// the pure http/https-hrefs a set, duplicates are removed here
// All href attributes with http(s) protocol, including potential duplicates, are needed
// to calculate "nrOfOccurrences"; the pure http/https hrefs are kept as a set, so duplicates are removed here
private Set<String> hrefSet;


BrokenHttpLinksChecker(Configuration pConfig) {
super(pConfig);
BrokenHttpLinksChecker(Configuration configuration) {
super(configuration);

errorCodes = getMyConfig().getHttpErrorCodes();
warningCodes = getMyConfig().getHttpWarningCodes();
Expand All @@ -59,7 +57,7 @@ protected void initCheckingResultsDescription() {
protected SingleCheckResults check(final HtmlPage pageToCheck) {
log.trace("Checking '{}'", pageToCheck.getFile());

//get set of all a-tags "<a href=..." in html file,
// get a set of all a-tags "<a href=..." in HTML file,
// restricted to http(s) links

hrefSet = pageToCheck.getAllHttpHrefStringsAsSet();
Expand All @@ -82,11 +80,11 @@ private void addWarningIfNoInternetConnection() {
}

/**
* check all http(s) links
* Check all http(s) links
* TODO: use GPARS to check several links in parallel, as sequential checking might take too long
**/
private void checkAllHttpLinks() {
// for all hrefSet check if the corresponding link is valid
// Check if the corresponding link is valid for all hrefSet
hrefSet.forEach(this::doubleCheckSingleHttpLink);
}

Expand All @@ -98,8 +96,6 @@ private void checkAllHttpLinks() {
* we try again with a GET, to get the "finalResponseCode" -
* which we then categorize as success, error or warning
*/


protected void doubleCheckSingleHttpLink(String href) {
// bookkeeping:
getCheckingResults().incNrOfChecks();
Expand All @@ -108,14 +104,14 @@ protected void doubleCheckSingleHttpLink(String href) {
URL url = new URL(href);
checkIfLocalhostURL(url, href);
checkIfIPAddress(url, href);
checkHttpLinkWithRetry(url, href);
checkHttpLinkWithRetry(url, href, getMyConfig().getRetries());
} catch (MalformedURLException exception) {
Finding malformedURLFinding = new Finding("malformed URL exception with href=" + href);
getCheckingResults().addFinding(malformedURLFinding);
}
}

private void checkHttpLinkWithRetry(URL url, String href) {
private void checkHttpLinkWithRetry(URL url, String href, int retries) {
String problem;
try {
HttpURLConnection firstConnection = getNewURLConnection(url);
Expand All @@ -124,7 +120,7 @@ private void checkHttpLinkWithRetry(URL url, String href) {
firstConnection.connect();
int responseCode = firstConnection.getResponseCode();

// issue 218 and 219: some web servers respond with 403 or 405
// Issue 218 and 219: some web servers respond with 403 or 405
// when given HEAD requests. Therefore, try to GET
if (successCodes.contains(responseCode)) {
return;
Expand Down Expand Up @@ -169,8 +165,13 @@ else if (Web.HTTP_REDIRECT_CODES.contains(responseCode)) {
firstConnection.disconnect();

} catch (UnknownHostException exception) {
Finding unknownHostFinding = new Finding("Unknown host with href=" + href, exception);
getCheckingResults().addFinding(unknownHostFinding);
if (retries > 0) {
log.warn("Unknown host exception with href={}, retrying", href);
checkHttpLinkWithRetry(url, href, retries - 1);
} else {
Finding unknownHostFinding = new Finding("Unknown host with href=" + href, exception);
getCheckingResults().addFinding(unknownHostFinding);
}
} catch (IOException exception) {
Finding someException = new Finding("exception " + exception + " with href=" + href, exception);
getCheckingResults().addFinding(someException);
Expand Down Expand Up @@ -212,12 +213,12 @@ private void checkIfIPAddress(URL url, String href) {
}
}

// if configured ,localhost-URLs yield warnings!
// if configured, localhost-URLs yield warnings!
private void checkIfLocalhostURL(URL url, String href) {
if (!getMyConfig().isIgnoreLocalhost()) {
String host = url.getHost();
if (("localhost".equals(host)) || host.startsWith("127.0.0")) {
Finding localhostWarning = new Finding("Warning: localhost urls indicates suspicious environment dependency: href=" + href);
Finding localhostWarning = new Finding("Warning: localhost urls indicate suspicious environment dependency: href=" + href);
getCheckingResults().addFinding(localhostWarning);
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@ class BrokenHttpLinksCheckerSpec extends Specification {
HtmlPage htmlPage
SingleCheckResults collector

private Configuration myConfig
static private int port

@Shared
Expand All @@ -41,8 +40,10 @@ class BrokenHttpLinksCheckerSpec extends Specification {
/* executed before every single spec */

def setup() {
myConfig = new Configuration()
brokenHttpLinksChecker = new BrokenHttpLinksChecker(myConfig)
Configuration configuration = Configuration.builder()
.retries(3)
.build()
brokenHttpLinksChecker = new BrokenHttpLinksChecker(configuration)

collector = new SingleCheckResults()
}
Expand Down
38 changes: 23 additions & 15 deletions htmlSanityCheck-gradle-plugin/README.adoc
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,8 @@ plugins {
id 'org.aim42.{project}' version '{hsc-version}' // <1>
}
----
<1> Checkout <<box:current-version,current version>>

<1> Check out <<box:current-version,current version>>

=== Legacy Installation

Expand All @@ -52,6 +52,7 @@ buildscript {
apply plugin: 'org.aim42.{project}'
----

<1> In case you would like to use a development version (or even branch), check out <<sec:development-versions,development versions>>.
<2> Beginning with version `2.x` all releases will be published to https://central.sonatype.com/artifact/org.aim42.htmlSanityCheck/org.aim42.htmlSanityCheck.gradle.plugin[Maven Central].
<3> The https://plugins.gradle.org[Gradle Plugin Portal] contains https://plugins.gradle.org/plugin/org.aim42.htmlSanityCheck[most versions] or will redirect downloads of newer versions to Maven Central.
Expand All @@ -61,8 +62,8 @@ apply plugin: 'org.aim42.{project}'
[IMPORTANT]
.Latest (development) versions
====
* The latest https://github.com/aim42/htmlSanityCheck/releases[release version] is defined in https://github.com/aim42/htmlSanityCheck/blob/main/gradle.properties[`gradle.properties` on `main` branch].
* The current (development) version is defined in {gradleProperties}
* https://github.com/aim42/htmlSanityCheck/blob/main/gradle.properties[`gradle.properties` on `main` branch] defines the latest https://github.com/aim42/htmlSanityCheck/releases[release version].
* {gradleProperties} defines the current (development) version.
+
[source]
.Development version
Expand All @@ -79,7 +80,7 @@ The plugin adds a new task named `htmlSanityCheck`.
This task exposes a few properties as part of its configuration:

[horizontal]
`sourceDir` (mandatory):: Directory where the HTML files are located.
`sourceDir` (mandatory):: Directory containing the HTML files to check.
+
Type: Directory.

Expand All @@ -95,14 +96,14 @@ Type: Directory.
+
Default: `+{buildDir}+/reports/htmlSanityCheck/`

`junitResultsDir` (optional):: Directory where the results are written to in JUnit XML format.
JUnit XML can be read by many tools, including CI environments.
`junitResultsDir` (optional):: Directory where HSC writes the results in JUnit XML format.
Many tools support https://github.com/testmoapp/junitxml[JUnit XML] format, including CI environments.
+
Type: Directory.
+
Default: `+{buildDir}+/test-results/htmlchecks/`

`failOnErrors` (optional):: Fail the build if any error was found in the checked pages.
`failOnErrors` (optional):: Fail the build if any of the checked pages contains an error.
+
Type: Boolean.
+
Expand All @@ -126,6 +127,15 @@ Type: Boolean.
+
Default: `false`.

`retries` (optional):: Number of times to retry failed requests (e.g., due to network issues).
+
*Note*: Currently only used for `UnknownHostException`s (cf.
{project-issues}/270[#270]).
+
Type: Integer.
+
Default: `0`.

`checkerClasses` (optional):: The set of checker classes to be executed.
+
Type: List.
Expand Down Expand Up @@ -155,12 +165,11 @@ include::../htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/tools/W
[NOTE]
.HTTP Redirects
====
Note that HTTP redirects are treated as a warning to make the user aware of the correct or new location (cf. {project-issues}/244[Issue 244]).
Note that HSC treats HTTP redirects as a warning to make the user aware of the correct or new location (cf.
{project-issues}/244[Issue 244]).
Some HSC reports often contain the respective location.
====



`httpErrorCodes` (optional):: Additional HTTP response codes treated as error.
+
Type: List.
Expand Down Expand Up @@ -190,12 +199,13 @@ The lists shown above are the default HTTP response codes handled by HSC.
The mentioned configurations effectively move the configured codes around, i.e., if you add `308` to `httpErrorCodes` it is automatically removed from its default list (`httpWarningCodes`).
****


[[sec:examples]]
== Examples

=== Small Example

`build.gradle`

[source,groovy]
----
apply plugin: 'org.aim42.htmlSanityCheck'
Expand Down Expand Up @@ -269,8 +279,6 @@ asciidoctor {
from( srcImagesPath )
into targetImagesPath
}
}
apply plugin: 'org.aim42.htmlSanityCheck'
Expand Down Expand Up @@ -320,7 +328,7 @@ htmlSanityCheck {

== Compatibility

The Plugin has been tested with the following Gradle versions:
We test the Plugin with the following Gradle versions:

[source,groovy]
.Tested Gradle versions
Expand All @@ -330,7 +338,7 @@ include::src/test/groovy/org/aim42/htmlsanitycheck/gradle/HtmlSanityCheckTaskFun

[NOTE]
====
The full range of Gradle versions is only tested in CI (GitHub action).
We test the full range of Gradle versions only in CI (GitHub action) as it is time-consuming.
The local test only tests the latest Gradle version:
[source,groovy]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,9 @@ class HtmlSanityCheckTask extends DefaultTask {
@Input
List<Class<? extends Checker>> checkerClasses = AllCheckers.CHECKER_CLASSES

@Input
Integer retries = 0

// private stuff
// **************************************************************************
private Configuration myConfig
Expand Down Expand Up @@ -187,6 +190,7 @@ See ${checkingResultsDir} for a detailed report."""
.ignoreIPAddresses(ignoreIPAddresses)

.checksToExecute(checkerClasses)
.retries(retries)
.build()

// in case we have configured specific interpretations of http status codes
Expand Down

0 comments on commit 51f3532

Please sign in to comment.