diff --git a/application/src/main/java/org/togetherjava/tjbot/features/utils/LinkDetection.java b/application/src/main/java/org/togetherjava/tjbot/features/utils/LinkDetection.java index 3b6dc18112..f98cb6c06d 100644 --- a/application/src/main/java/org/togetherjava/tjbot/features/utils/LinkDetection.java +++ b/application/src/main/java/org/togetherjava/tjbot/features/utils/LinkDetection.java @@ -4,27 +4,62 @@ import com.linkedin.urls.detection.UrlDetector; import com.linkedin.urls.detection.UrlDetectorOptions; +import java.net.URI; +import java.net.http.HttpClient; +import java.net.http.HttpRequest; +import java.net.http.HttpResponse; import java.util.List; import java.util.Optional; import java.util.Set; +import java.util.concurrent.CompletableFuture; /** - * Utility class to detect links. + * Utility methods for working with links inside arbitrary text. + * + *

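+ * This class can:
+ * <ul>
+ * <li>extract links from text ({@link #extractLinks(String, Set)}),</li>
+ * <li>check whether text contains a link ({@link #containsLink(String)}),</li>
+ * <li>check whether a link is broken ({@link #isLinkBroken(String)}), and</li>
+ * <li>replace broken links inside text ({@link #replaceBrokenLinks(String, String)}).</li>
+ * </ul>
+ *
+ * <p>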
+ * It is intentionally stateless and uses asynchronous HTTP requests to avoid blocking calling + * threads. */ + public class LinkDetection { + private static final HttpClient HTTP_CLIENT = HttpClient.newHttpClient(); /** - * Possible ways to filter a link. + * Default filters applied when extracting links from text. * - * @see LinkDetection + *

+ * Links to intentionally ignore in order to reduce false positives when scanning chat messages + * or source-code snippets. + */ + + private static final Set DEFAULT_FILTERS = + Set.of(LinkFilter.SUPPRESSED, LinkFilter.NON_HTTP_SCHEME); + + /** + * Filters that control which detected URLs are returned by {@link #extractLinks}. */ public enum LinkFilter { /** - * Filters links suppressed with {@literal }. + * Ignores URLs that are wrapped in angle brackets, e.g. {@code }. + * + *

+ * Such links are often intentionally suppressed in chat platforms. */ SUPPRESSED, /** - * Filters links that are not using http scheme. + * Ignores URLs that do not use the HTTP or HTTPS scheme. + * + *

+ * This helps avoid false positives such as {@code ftp://}, {@code file://}, or scheme-less + * matches. */ NON_HTTP_SCHEME } @@ -34,30 +69,187 @@ private LinkDetection() { } /** - * Extracts all links from the given content. + * Extracts links from the given text. + * + *

+ * The text is scanned using a URL detector, then filtered and normalized according to the + * provided {@link LinkFilter}s. + * + *

+ * Example: * - * @param content the content to search through - * @param filter the filters applied to the urls - * @return a list of all found links, can be empty + *

{@code
+     * Set<LinkFilter> filters = Set.of(LinkFilter.SUPPRESSED, LinkFilter.NON_HTTP_SCHEME);
+     * extractLinks("Visit https://example.com and <https://suppressed.example>", filters)
+     * // returns ["https://example.com"]
+     * }
+ * + * @param content the text to scan for links + * @param filter a set of filters controlling which detected links are returned + * @return a list of extracted links in the order they appear in the text */ + public static List extractLinks(String content, Set filter) { return new UrlDetector(content, UrlDetectorOptions.BRACKET_MATCH).detect() - .stream() - .map(url -> toLink(url, filter)) - .flatMap(Optional::stream) - .toList(); + .stream() + .map(url -> toLink(url, filter)) + .flatMap(Optional::stream) + .toList(); } /** - * Checks whether the given content contains a link. + * Extracts links from the given text using default filters. + * + *

+ * This is a convenience method that uses {@link #DEFAULT_FILTERS}. * - * @param content the content to search through - * @return true if the content contains at least one link + * @param content the text to scan for links + * @return a list of extracted links in the order they appear in the text + * @see #extractLinks(String, Set) */ + public static List extractLinks(String content) { + return extractLinks(content, DEFAULT_FILTERS); + } + + /** + * Checks whether the given text contains at least one detectable URL. + * + *

+ * This method performs a lightweight detection only and does not apply any {@link LinkFilter}s. + * + * @param content the text to scan + * @return {@code true} if at least one URL-like pattern is detected + */ + public static boolean containsLink(String content) { return !(new UrlDetector(content, UrlDetectorOptions.BRACKET_MATCH).detect().isEmpty()); } + /** + * Asynchronously checks whether a URL is considered broken. + * + *

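+ * A link is considered broken if:
+ * <ul>
+ * <li>the initial HEAD request fails with an exception or returns a status code of 400 or
+ * higher, and</li>
+ * <li>the fallback GET request that is then sent also fails with an exception or returns a
+ * status code of 400 or higher.</li>
+ * </ul>
+ *
+ * <p>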
+ * Successful responses (2xx) and redirects (3xx) are considered valid links. The response body + * is never inspected. + * + * @param url the URL to check + * @return a {@code CompletableFuture} completing with {@code true} if the link is broken, + * {@code false} otherwise + */ + + public static CompletableFuture isLinkBroken(String url) { + // Try HEAD request first (cheap and fast) + HttpRequest headRequest = HttpRequest.newBuilder(URI.create(url)) + .method("HEAD", HttpRequest.BodyPublishers.noBody()) + .build(); + + return HTTP_CLIENT.sendAsync(headRequest, HttpResponse.BodyHandlers.discarding()) + .thenApply(response -> { + int status = response.statusCode(); + // 2xx and 3xx are success, 4xx and 5xx are errors + return status >= 400; + }) + .exceptionally(_ -> true) + .thenCompose(result -> { + if (!Boolean.TRUE.equals(result)) { + return CompletableFuture.completedFuture(false); + } + // If HEAD fails, fall back to GET request (some servers don't support HEAD) + HttpRequest fallbackGetRequest = + HttpRequest.newBuilder(URI.create(url)).GET().build(); + return HTTP_CLIENT + .sendAsync(fallbackGetRequest, HttpResponse.BodyHandlers.discarding()) + .thenApply(resp -> resp.statusCode() >= 400) + .exceptionally(_ -> true); + }); + } + + /** + * Replaces all broken links in the given text. + * + *

+ * Each detected link is checked asynchronously using {@link #isLinkBroken(String)}. Only links
+ * confirmed as broken are replaced. Duplicate URLs are checked only once and all occurrences
+ * are replaced if found to be broken.
+ *
+ * <p>
+ * This method does not block; all link checks are performed asynchronously and combined into a
+ * single {@code CompletableFuture}.
+ *
+ * <p>
+ * Example:
+ *
+ * <pre>{@code
+     * replaceBrokenLinks("""
+     *           Test
+     *           http://deadlink/1
+     *           http://workinglink/1
+     *         """, "(broken link)")
+     * }</pre>
+ *
+ * <p>
+ * Results in:
+ *
+ * <pre>{@code
+     * Test
+     * (broken link)
+     * http://workinglink/1
+     * }</pre>
+ * + * @param text the input text containing URLs + * @param replacement the string used to replace broken links + * @return a {@code CompletableFuture} that completes with the modified text, or the original + * text if no broken links were found + */ + + public static CompletableFuture replaceBrokenLinks(String text, String replacement) { + List links = extractLinks(text, DEFAULT_FILTERS); + + if (links.isEmpty()) { + return CompletableFuture.completedFuture(text); + } + + List>> brokenLinkFutures = links.stream() + .distinct() + .map(link -> isLinkBroken(link) + .thenApply(isBroken -> Boolean.TRUE.equals(isBroken) ? Optional.of(link) : Optional.empty())) + .toList(); + + return CompletableFuture.allOf(brokenLinkFutures.toArray(new CompletableFuture[0])) + .thenApply(_ -> brokenLinkFutures.stream() + .map(CompletableFuture::join) + .flatMap(Optional::stream) + .toList()) + .thenApply(brokenLinks -> { + String result = text; + for (String brokenLink : brokenLinks) { + result = result.replace(brokenLink, replacement); + } + return result; + }); + } + + /** + * Converts a detected {@link Url} into a normalized link string. + * + *

+ * Applies the provided {@link LinkFilter}s. Additionally removes trailing punctuation such as + * commas or periods from the detected URL. + * + * @param url the detected URL + * @param filter active link filters to apply + * @return an {@link Optional} containing the normalized link, or {@code Optional.empty()} if + * the link should be filtered out + */ + private static Optional toLink(Url url, Set filter) { String raw = url.getOriginalUrl(); if (filter.contains(LinkFilter.SUPPRESSED) && raw.contains(">")) { @@ -76,8 +268,6 @@ private static Optional toLink(Url url, Set filter) { // Remove trailing punctuation link = link.substring(0, link.length() - 1); } - return Optional.of(link); } - -} +} \ No newline at end of file