From 29b41d1cfad657ad51327443174a395903849cb4 Mon Sep 17 00:00:00 2001 From: qiancai Date: Fri, 15 May 2026 10:39:39 +0800 Subject: [PATCH 1/3] improve lychee link checks --- .../scripts/extract-changed-markdown-lines.pl | 46 +++++++++++++ .github/scripts/extract-site-hrefs.pl | 55 ++++++++++++++++ .github/workflows/link-fail-fast.yaml | 47 +++++++++---- .github/workflows/link.yaml | 66 +++++++++++++++++-- .lycheeignore | 5 +- 5 files changed, 200 insertions(+), 19 deletions(-) create mode 100644 .github/scripts/extract-changed-markdown-lines.pl create mode 100644 .github/scripts/extract-site-hrefs.pl diff --git a/.github/scripts/extract-changed-markdown-lines.pl b/.github/scripts/extract-changed-markdown-lines.pl new file mode 100644 index 0000000000000..664bd86822a43 --- /dev/null +++ b/.github/scripts/extract-changed-markdown-lines.pl @@ -0,0 +1,46 @@ +use strict; +use warnings; +use File::Basename qw(dirname); +use File::Path qw(make_path); + +my ($out_root, $list_path) = @ARGV; +die "usage: $0 OUT_ROOT LIST_PATH\n" unless defined $out_root && defined $list_path; + +my %added_lines_by_file; +my %has_link_candidate; +my $file; + +while (my $line = <STDIN>) { + chomp $line; + + if ($line =~ m{^\+\+\+ b/(.+)$}) { + $file = $1; + next; + } + + next unless defined $file; + next unless $line =~ /^\+(?!\+\+)(.*)$/; + + my $content = $1; + push @{$added_lines_by_file{$file}}, $content; + $has_link_candidate{$file} = 1 if $content =~ m{https?://}i || $content =~ /\bhref\s*=/i; +} + +make_path($out_root); +open my $list_fh, ">", $list_path or die "cannot write $list_path: $!"; + +for my $file (sort keys %added_lines_by_file) { + next unless $has_link_candidate{$file}; + next if $file =~ m{(?:^|/)\.\.(?:/|$)}; + + my $out_path = "$out_root/$file"; + make_path(dirname($out_path)); + open my $out_fh, ">", $out_path or die "cannot write $out_path: $!"; + for my $line (@{$added_lines_by_file{$file}}) { + print {$out_fh} "$line\n"; + } + close $out_fh; + print {$list_fh} 
"$out_path\n"; +} + +close $list_fh; diff --git a/.github/scripts/extract-site-hrefs.pl b/.github/scripts/extract-site-hrefs.pl new file mode 100644 index 0000000000000..36c1d80cdeee1 --- /dev/null +++ b/.github/scripts/extract-site-hrefs.pl @@ -0,0 +1,55 @@ +use strict; +use warnings; +use File::Basename qw(dirname); +use File::Path qw(make_path); + +my ($out_root, $list_path) = @ARGV; +die "usage: $0 OUT_ROOT LIST_PATH\n" unless defined $out_root && defined $list_path; + +my $site_base_url = $ENV{DOCS_SITE_BASE_URL}; +die "DOCS_SITE_BASE_URL is not set\n" unless defined $site_base_url && $site_base_url ne ""; +$site_base_url =~ s{/+\z}{}; + +make_path($out_root); +open my $list_fh, ">", $list_path or die "cannot write $list_path: $!"; + +local $/ = "\0"; +while (my $file = ) { + chomp $file; + next if $file =~ m{(?:^|/)\.\.(?:/|$)}; + next unless -f $file; + + open my $in_fh, "<", $file or die "cannot read $file: $!"; + my %seen; + while (my $line = <$in_fh>) { + while ($line =~ /\bhref\s*=\s*(["'])(.*?)\1/gi) { + my $href = $2; + $href =~ s/^\s+|\s+$//g; + next if $href eq ""; + next if $href =~ m{^https?://}i; + next if $href =~ m{^(?:#|mailto:|tel:|javascript:|data:)}i; + + my $url; + if ($href =~ m{^//}) { + $url = "https:$href"; + } elsif ($href =~ m{^/}) { + $url = "$site_base_url$href"; + } else { + $url = "$site_base_url/$href"; + } + $seen{$url} = 1; + } + } + close $in_fh; + + next unless %seen; + my $out_path = "$out_root/$file"; + make_path(dirname($out_path)); + open my $out_fh, ">", $out_path or die "cannot write $out_path: $!"; + for my $url (sort keys %seen) { + print {$out_fh} "<$url>\n"; + } + close $out_fh; + print {$list_fh} "$out_path\n"; +} +close $list_fh; diff --git a/.github/workflows/link-fail-fast.yaml b/.github/workflows/link-fail-fast.yaml index ede0b16ba6086..b2d936adcd343 100644 --- a/.github/workflows/link-fail-fast.yaml +++ b/.github/workflows/link-fail-fast.yaml @@ -1,8 +1,14 @@ -name: Links (Fail Fast) +name: Check external URLs 
in changed lines of a PR on: pull_request: +env: + DOCS_SITE_BASE_URL: "https://docs.pingcap.com" + +permissions: + contents: read + jobs: linkChecker: runs-on: ubuntu-latest @@ -11,25 +17,40 @@ jobs: with: fetch-depth: 2 - - name: 'Get a list of changed markdown files to process' - id: changed-files + - name: Collect changed markdown lines with links + id: changed-lines run: | - CHANGED_FILES=$(git diff-tree --name-only --diff-filter 'AM' -r HEAD^1 HEAD -- "*.md" | sed -z "s/\n$//;s/\n/' '/g") - echo "all_changed_files=${CHANGED_FILES}" >> $GITHUB_OUTPUT + git -c core.quotePath=false diff --unified=0 --diff-filter=AM --no-ext-diff --no-color HEAD^1 HEAD -- '*.md' | + perl .github/scripts/extract-changed-markdown-lines.pl .lychee-pr-changed-lines .lychee-pr-inputs.txt - - name: Restore lychee cache - uses: actions/cache@v4 - with: - path: .lycheecache - key: cache-lychee-${{ github.sha }} - restore-keys: cache-lychee- + count=$(wc -l < .lychee-pr-inputs.txt | tr -d ' ') + echo "count=${count}" >> "$GITHUB_OUTPUT" + + if [ "$count" -gt 0 ]; then + echo "has_inputs=true" >> "$GITHUB_OUTPUT" + sed 's/^/- /' .lychee-pr-inputs.txt + else + echo "has_inputs=false" >> "$GITHUB_OUTPUT" + fi + + - name: Collect doc site href URLs + if: ${{ steps.changed-lines.outputs.has_inputs == 'true' }} + run: | + tr '\n' '\0' < .lychee-pr-inputs.txt | + perl .github/scripts/extract-site-hrefs.pl .lychee-site-hrefs .lychee-site-href-files.txt + + count=$(wc -l < .lychee-site-href-files.txt | tr -d ' ') + if [ "$count" -gt 0 ]; then + cat .lychee-site-href-files.txt >> .lychee-pr-inputs.txt + sed 's/^/- /' .lychee-site-href-files.txt + fi - name: Link Checker - if: ${{ steps.changed-files.outputs.all_changed_files }} + if: ${{ steps.changed-lines.outputs.has_inputs == 'true' }} uses: lycheeverse/lychee-action@v2 with: fail: true failIfEmpty: false - args: --root-dir $(pwd) --cache --max-cache-age 1d -E -i -n -t 45 -- '${{ steps.changed-files.outputs.all_changed_files }}' + args: 
--root-dir $(pwd) --exclude '^file://' -E -i -n -t 45 --files-from .lychee-pr-inputs.txt env: GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}} diff --git a/.github/workflows/link.yaml b/.github/workflows/link.yaml index b690b6675f777..bfe58bf19c37c 100644 --- a/.github/workflows/link.yaml +++ b/.github/workflows/link.yaml @@ -1,4 +1,4 @@ -name: Links +name: Check external URLs in all files on: repository_dispatch: @@ -6,6 +6,13 @@ on: schedule: - cron: "0 0 * * 1" +env: + DOCS_SITE_BASE_URL: "https://docs.pingcap.com" + +permissions: + contents: read + issues: write + jobs: linkChecker: if: github.repository == 'pingcap/docs' @@ -15,7 +22,7 @@ jobs: - name: Download Exclude Path run: | - curl https://raw.githubusercontent.com/pingcap/docs/master/.lycheeignore --output .lycheeignore + curl -fsSL https://raw.githubusercontent.com/pingcap/docs/master/.lycheeignore --output .lycheeignore - name: Restore lychee cache uses: actions/cache@v4 @@ -28,14 +35,63 @@ jobs: uses: lycheeverse/lychee-action@v2 with: # Don't fail as we want the workflow to continue and run 'Create Issue From File' - # Excluding releases paths as historic releases may have outdated links. fail: false failIfEmpty: false - args: --root-dir $(pwd) --cache --max-cache-age 8d -E -i -n -t 45 --exclude-path '^./releases/' --exclude-path '^./tidb-cloud/releases/' --exclude-path '^./resources/' . - output: out.md + args: --root-dir $(pwd) --cache --max-cache-age 8d --cache-exclude-status '..200,300..' --exclude '^file://' -E -i -n -t 45 --exclude-path '^\./releases/' --exclude-path '^\./tidb-cloud/releases/' --exclude-path '^\./resources/' . 
+ output: out-external.md env: GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}} + - name: Collect doc site href URLs + id: site-hrefs + run: | + git ls-files -z -- \ + '*.md' '*.mdx' '*.markdown' '*.mkd' '*.mdown' '*.mdwn' '*.mkdn' '*.mkdown' \ + '*.html' '*.htm' '*.css' '*.txt' | + perl -0ne 'print unless m{^(?:releases|tidb-cloud/releases|resources)/}' | + perl .github/scripts/extract-site-hrefs.pl .lychee-site-hrefs .lychee-site-href-files.txt + + count=$(wc -l < .lychee-site-href-files.txt | tr -d ' ') + echo "count=${count}" >> "$GITHUB_OUTPUT" + + if [ "$count" -gt 0 ]; then + echo "has_hrefs=true" >> "$GITHUB_OUTPUT" + sed 's/^/- /' .lychee-site-href-files.txt + else + echo "has_hrefs=false" >> "$GITHUB_OUTPUT" + fi + + - name: Check site href URLs + if: ${{ steps.site-hrefs.outputs.has_hrefs == 'true' }} + uses: lycheeverse/lychee-action@v2 + with: + # Don't fail as we want the workflow to continue and run 'Create Issue From File' + fail: false + failIfEmpty: false + args: --cache --max-cache-age 8d --cache-exclude-status '..200,300..' 
-E -i -n -t 45 --files-from .lychee-site-href-files.txt + output: out-site-hrefs.md + env: + GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}} + + - name: Combine Link Reports + run: | + { + echo "# External URL Check" + echo + if [ -f out-external.md ]; then + cat out-external.md + else + echo "*(external link check did not produce output)*" + fi + + if [ -f out-site-hrefs.md ]; then + echo + echo "# Site href URL Check" + echo + cat out-site-hrefs.md + fi + } > out.md + - name: Create Issue From File uses: peter-evans/create-issue-from-file@v6 with: diff --git a/.lycheeignore b/.lycheeignore index 7b625949c0fe7..dd89727f9573e 100644 --- a/.lycheeignore +++ b/.lycheeignore @@ -33,13 +33,16 @@ https://platform\.openai\.com/api-keys https://openai\.com/.* https://jwt\.io/ https://typeorm\.io/.* +https://dl\.acm\.org/doi/10\.1145/(1988842\.1988850|2588555\.2610507) +https://developer\.salesforce\.com/.* +https?://(www\.)?npmjs\.com/package/.* https://dash\.cloudflare\.com/.* https://centminmod\.com/mydumper\.html https://docs\.pingcap\.com/tidb/v6\.6/system-variables#tidb_pessimistic_txn_aggressive_locking-new-in-v660 https://docs\.pingcap\.com/tidb/v7\.6/system-variables#tidb_ddl_version-new-in-v760 https://developers\.redhat\.com/blog/2021/01/05/building-red-hat-enterprise-linux-9-for-the-x86-64-v2-microarchitecture-level -https://.*github.*/%7B%7B%7B%20.tidb_operator_version%20%7D%7D%7D https://.*github.*/%7B%7B%7B.tidb-operator-version%7D%7D%7D +https://console\.cloud\.google\.com/.* https://portal\.azure\.com/.* https://azuremarketplace\.microsoft\.com/.* https://one\.newrelic\.com/.* From 843441ecc526fadfa261492c1a5d71ca7b41aadf Mon Sep 17 00:00:00 2001 From: qiancai Date: Fri, 15 May 2026 13:20:46 +0800 Subject: [PATCH 2/3] address href extraction review --- .github/scripts/extract-site-hrefs.pl | 49 ++++++++++++++------------- 1 file changed, 26 insertions(+), 23 deletions(-) diff --git a/.github/scripts/extract-site-hrefs.pl b/.github/scripts/extract-site-hrefs.pl 
index 36c1d80cdeee1..605d618eec322 100644 --- a/.github/scripts/extract-site-hrefs.pl +++ b/.github/scripts/extract-site-hrefs.pl @@ -13,21 +13,25 @@ make_path($out_root); open my $list_fh, ">", $list_path or die "cannot write $list_path: $!"; -local $/ = "\0"; -while (my $file = <STDIN>) { - chomp $file; - next if $file =~ m{(?:^|/)\.\.(?:/|$)}; - next unless -f $file; - - open my $in_fh, "<", $file or die "cannot read $file: $!"; - my %seen; - while (my $line = <$in_fh>) { - while ($line =~ /\bhref\s*=\s*(["'])(.*?)\1/gi) { +{ + local $/ = "\0"; + while (my $file = <STDIN>) { + chomp $file; + next if $file =~ m{(?:^|/)\.\.(?:/|$)}; + next unless -f $file; + + open my $in_fh, "<", $file or die "cannot read $file: $!"; + my $content = do { local $/; <$in_fh> }; + close $in_fh; + next unless defined $content; + + my %seen; + while ($content =~ /\bhref\s*=\s*(["'])(.*?)\1/gi) { my $href = $2; $href =~ s/^\s+|\s+$//g; next if $href eq ""; next if $href =~ m{^https?://}i; - next if $href =~ m{^(?:#|mailto:|tel:|javascript:|data:)}i; + next if $href =~ m{^(?:#|[a-z][a-z0-9+.-]*:)}i; my $url; if ($href =~ m{^//}) { @@ -35,21 +39,20 @@ } elsif ($href =~ m{^/}) { $url = "$site_base_url$href"; } else { - $url = "$site_base_url/$href"; + next; } $seen{$url} = 1; } + + next unless %seen; + my $out_path = "$out_root/$file"; + make_path(dirname($out_path)); + open my $out_fh, ">", $out_path or die "cannot write $out_path: $!"; + for my $url (sort keys %seen) { + print {$out_fh} "<$url>\n"; + } + close $out_fh; + print {$list_fh} "$out_path\n"; } - close $in_fh; - - next unless %seen; - my $out_path = "$out_root/$file"; - make_path(dirname($out_path)); - open my $out_fh, ">", $out_path or die "cannot write $out_path: $!"; - for my $url (sort keys %seen) { - print {$out_fh} "<$url>\n"; - } - close $out_fh; - print {$list_fh} "$out_path\n"; } close $list_fh; From 1d37c20daadd6a6bbaaf5743273ac4947d3acd16 Mon Sep 17 00:00:00 2001 From: Grace Cai Date: Fri, 15 May 2026 14:46:15 +0800 Subject: 
[PATCH 3/3] Update .github/workflows/link-fail-fast.yaml --- .github/workflows/link-fail-fast.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/link-fail-fast.yaml b/.github/workflows/link-fail-fast.yaml index b2d936adcd343..9e881c3d45a39 100644 --- a/.github/workflows/link-fail-fast.yaml +++ b/.github/workflows/link-fail-fast.yaml @@ -1,4 +1,4 @@ -name: Check external URLs in changed lines of a PR +name: ci / external-links-in-changed-lines (pull_request) on: pull_request: