From d11f54ac48b1e470553460f9725ec863ece0164a Mon Sep 17 00:00:00 2001 From: Grace Cai Date: Fri, 15 May 2026 14:58:06 +0800 Subject: [PATCH 1/2] This is an automated cherry-pick of #22894 Signed-off-by: ti-chi-bot --- .../scripts/extract-changed-markdown-lines.pl | 46 +++++++++++++ .github/scripts/extract-site-hrefs.pl | 58 ++++++++++++++++ .github/workflows/link-fail-fast.yaml | 48 ++++++++++++-- .github/workflows/link.yaml | 66 +++++++++++++++++-- .lycheeignore | 7 ++ 5 files changed, 215 insertions(+), 10 deletions(-) create mode 100644 .github/scripts/extract-changed-markdown-lines.pl create mode 100644 .github/scripts/extract-site-hrefs.pl diff --git a/.github/scripts/extract-changed-markdown-lines.pl b/.github/scripts/extract-changed-markdown-lines.pl new file mode 100644 index 0000000000000..664bd86822a43 --- /dev/null +++ b/.github/scripts/extract-changed-markdown-lines.pl @@ -0,0 +1,46 @@ +use strict; +use warnings; +use File::Basename qw(dirname); +use File::Path qw(make_path); + +my ($out_root, $list_path) = @ARGV; +die "usage: $0 OUT_ROOT LIST_PATH\n" unless defined $out_root && defined $list_path; + +my %added_lines_by_file; +my %has_link_candidate; +my $file; + +while (my $line = <STDIN>) { + chomp $line; + + if ($line =~ m{^\+\+\+ b/(.+)$}) { + $file = $1; + next; + } + + next unless defined $file; + next unless $line =~ /^\+(?!\+\+)(.*)$/; + + my $content = $1; + push @{$added_lines_by_file{$file}}, $content; + $has_link_candidate{$file} = 1 if $content =~ m{https?://}i || $content =~ /\bhref\s*=/i; +} + +make_path($out_root); +open my $list_fh, ">", $list_path or die "cannot write $list_path: $!"; + +for my $file (sort keys %added_lines_by_file) { + next unless $has_link_candidate{$file}; + next if $file =~ m{(?:^|/)\.\.(?:/|$)}; + + my $out_path = "$out_root/$file"; + make_path(dirname($out_path)); + open my $out_fh, ">", $out_path or die "cannot write $out_path: $!"; + for my $line (@{$added_lines_by_file{$file}}) { + print {$out_fh} "$line\n"; + } + 
 close $out_fh; + print {$list_fh} "$out_path\n"; +} + +close $list_fh; diff --git a/.github/scripts/extract-site-hrefs.pl b/.github/scripts/extract-site-hrefs.pl new file mode 100644 index 0000000000000..605d618eec322 --- /dev/null +++ b/.github/scripts/extract-site-hrefs.pl @@ -0,0 +1,58 @@ +use strict; +use warnings; +use File::Basename qw(dirname); +use File::Path qw(make_path); + +my ($out_root, $list_path) = @ARGV; +die "usage: $0 OUT_ROOT LIST_PATH\n" unless defined $out_root && defined $list_path; + +my $site_base_url = $ENV{DOCS_SITE_BASE_URL}; +die "DOCS_SITE_BASE_URL is not set\n" unless defined $site_base_url && $site_base_url ne ""; +$site_base_url =~ s{/+\z}{}; + +make_path($out_root); +open my $list_fh, ">", $list_path or die "cannot write $list_path: $!"; + +{ + local $/ = "\0"; + while (my $file = <STDIN>) { + chomp $file; + next if $file =~ m{(?:^|/)\.\.(?:/|$)}; + next unless -f $file; + + open my $in_fh, "<", $file or die "cannot read $file: $!"; + my $content = do { local $/; <$in_fh> }; + close $in_fh; + next unless defined $content; + + my %seen; + while ($content =~ /\bhref\s*=\s*(["'])(.*?)\1/gi) { + my $href = $2; + $href =~ s/^\s+|\s+$//g; + next if $href eq ""; + next if $href =~ m{^https?://}i; + next if $href =~ m{^(?:#|[a-z][a-z0-9+.-]*:)}i; + + my $url; + if ($href =~ m{^//}) { + $url = "https:$href"; + } elsif ($href =~ m{^/}) { + $url = "$site_base_url$href"; + } else { + next; + } + $seen{$url} = 1; + } + + next unless %seen; + my $out_path = "$out_root/$file"; + make_path(dirname($out_path)); + open my $out_fh, ">", $out_path or die "cannot write $out_path: $!"; + for my $url (sort keys %seen) { + print {$out_fh} "<$url>\n"; + } + close $out_fh; + print {$list_fh} "$out_path\n"; + } +} +close $list_fh; diff --git a/.github/workflows/link-fail-fast.yaml b/.github/workflows/link-fail-fast.yaml index 9f3466d0c899b..3d82a701f0204 100644 --- a/.github/workflows/link-fail-fast.yaml +++ b/.github/workflows/link-fail-fast.yaml @@ -1,8 +1,14 @@ 
-name: Links (Fail Fast) +name: ci / external-links-in-changed-lines (pull_request) on: pull_request: +env: + DOCS_SITE_BASE_URL: "https://docs.pingcap.com" + +permissions: + contents: read + jobs: linkChecker: runs-on: ubuntu-latest @@ -11,17 +17,49 @@ jobs: with: fetch-depth: 2 - - name: 'Get a list of changed markdown files to process' - id: changed-files + - name: Collect changed markdown lines with links + id: changed-lines run: | - CHANGED_FILES=$(git diff-tree --name-only --diff-filter 'AM' -r HEAD^1 HEAD -- "*.md" | sed -z "s/\n$//;s/\n/' '/g") - echo "all_changed_files=${CHANGED_FILES}" >> $GITHUB_OUTPUT + git -c core.quotePath=false diff --unified=0 --diff-filter=AM --no-ext-diff --no-color HEAD^1 HEAD -- '*.md' | + perl .github/scripts/extract-changed-markdown-lines.pl .lychee-pr-changed-lines .lychee-pr-inputs.txt +<<<<<<< HEAD - name: Link Checker if: ${{ steps.changed-files.outputs.all_changed_files }} uses: lycheeverse/lychee-action@v2.3.0 with: fail: true args: --root-dir $(pwd) -E -i -n -t 45 -- '${{ steps.changed-files.outputs.all_changed_files }}' +======= + count=$(wc -l < .lychee-pr-inputs.txt | tr -d ' ') + echo "count=${count}" >> "$GITHUB_OUTPUT" + + if [ "$count" -gt 0 ]; then + echo "has_inputs=true" >> "$GITHUB_OUTPUT" + sed 's/^/- /' .lychee-pr-inputs.txt + else + echo "has_inputs=false" >> "$GITHUB_OUTPUT" + fi + + - name: Collect doc site href URLs + if: ${{ steps.changed-lines.outputs.has_inputs == 'true' }} + run: | + tr '\n' '\0' < .lychee-pr-inputs.txt | + perl .github/scripts/extract-site-hrefs.pl .lychee-site-hrefs .lychee-site-href-files.txt + + count=$(wc -l < .lychee-site-href-files.txt | tr -d ' ') + if [ "$count" -gt 0 ]; then + cat .lychee-site-href-files.txt >> .lychee-pr-inputs.txt + sed 's/^/- /' .lychee-site-href-files.txt + fi + + - name: Link Checker + if: ${{ steps.changed-lines.outputs.has_inputs == 'true' }} + uses: lycheeverse/lychee-action@v2 + with: + fail: true + failIfEmpty: false + args: --root-dir $(pwd) 
--exclude '^file://' -E -i -n -t 45 --files-from .lychee-pr-inputs.txt +>>>>>>> 145d861113 (workflow: optimize external link checks (#22894)) env: GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}} diff --git a/.github/workflows/link.yaml b/.github/workflows/link.yaml index a6e8c7b0f295f..208d9c192a570 100644 --- a/.github/workflows/link.yaml +++ b/.github/workflows/link.yaml @@ -1,4 +1,4 @@ -name: Links +name: Check external URLs in all files on: repository_dispatch: @@ -6,6 +6,13 @@ on: schedule: - cron: "0 0 * * 1" +env: + DOCS_SITE_BASE_URL: "https://docs.pingcap.com" + +permissions: + contents: read + issues: write + jobs: linkChecker: runs-on: ubuntu-latest @@ -14,20 +21,69 @@ jobs: - name: Download Exclude Path run: | - curl https://raw.githubusercontent.com/pingcap/docs/master/.lycheeignore --output .lycheeignore + curl -fsSL https://raw.githubusercontent.com/pingcap/docs/master/.lycheeignore --output .lycheeignore - name: Check Links uses: lycheeverse/lychee-action@v1.6.1 with: # Don't fail as we want the workflow to continue and run 'Create Issue From File' - # Excluding releases paths as historic releases may have outdated links. fail: false failIfEmpty: false - args: --root-dir $(pwd) --cache --max-cache-age 8d -E -i -n -t 45 --exclude-path '^./releases/' --exclude-path '^./tidb-cloud/releases/' --exclude-path '^./resources/' . - output: out.md + args: --root-dir $(pwd) --cache --max-cache-age 8d --cache-exclude-status '..200,300..' --exclude '^file://' -E -i -n -t 45 --exclude-path '^\./releases/' --exclude-path '^\./tidb-cloud/releases/' --exclude-path '^\./resources/' . 
+ output: out-external.md + env: + GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}} + + - name: Collect doc site href URLs + id: site-hrefs + run: | + git ls-files -z -- \ + '*.md' '*.mdx' '*.markdown' '*.mkd' '*.mdown' '*.mdwn' '*.mkdn' '*.mkdown' \ + '*.html' '*.htm' '*.css' '*.txt' | + perl -0ne 'print unless m{^(?:releases|tidb-cloud/releases|resources)/}' | + perl .github/scripts/extract-site-hrefs.pl .lychee-site-hrefs .lychee-site-href-files.txt + + count=$(wc -l < .lychee-site-href-files.txt | tr -d ' ') + echo "count=${count}" >> "$GITHUB_OUTPUT" + + if [ "$count" -gt 0 ]; then + echo "has_hrefs=true" >> "$GITHUB_OUTPUT" + sed 's/^/- /' .lychee-site-href-files.txt + else + echo "has_hrefs=false" >> "$GITHUB_OUTPUT" + fi + + - name: Check site href URLs + if: ${{ steps.site-hrefs.outputs.has_hrefs == 'true' }} + uses: lycheeverse/lychee-action@v2 + with: + # Don't fail as we want the workflow to continue and run 'Create Issue From File' + fail: false + failIfEmpty: false + args: --cache --max-cache-age 8d --cache-exclude-status '..200,300..' 
-E -i -n -t 45 --files-from .lychee-site-href-files.txt + output: out-site-hrefs.md env: GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}} + - name: Combine Link Reports + run: | + { + echo "# External URL Check" + echo + if [ -f out-external.md ]; then + cat out-external.md + else + echo "*(external link check did not produce output)*" + fi + + if [ -f out-site-hrefs.md ]; then + echo + echo "# Site href URL Check" + echo + cat out-site-hrefs.md + fi + } > out.md + - name: Create Issue From File uses: peter-evans/create-issue-from-file@v4 with: diff --git a/.lycheeignore b/.lycheeignore index 30c21151da22c..40bd6fa340d2a 100644 --- a/.lycheeignore +++ b/.lycheeignore @@ -33,14 +33,21 @@ https://platform\.openai\.com/api-keys https://openai\.com/.* https://jwt\.io/ https://typeorm\.io/.* +https://dl\.acm\.org/doi/10\.1145/(1988842\.1988850|2588555\.2610507) +https://developer\.salesforce\.com/.* +https?://(www\.)?npmjs\.com/package/.* https://dash\.cloudflare\.com/.* https://centminmod\.com/mydumper\.html https://docs\.pingcap\.com/tidb/v6\.6/system-variables#tidb_pessimistic_txn_aggressive_locking-new-in-v660 https://docs\.pingcap\.com/tidb/v7\.6/system-variables#tidb_ddl_version-new-in-v760 https://developers\.redhat\.com/blog/2021/01/05/building-red-hat-enterprise-linux-9-for-the-x86-64-v2-microarchitecture-level +<<<<<<< HEAD https://portal\.azure\.com/.* https://.*github.*/%7B%7B%7B%20.tidb_operator_version%20%7D%7D%7D +======= +>>>>>>> 145d861113 (workflow: optimize external link checks (#22894)) https://.*github.*/%7B%7B%7B.tidb-operator-version%7D%7D%7D +https://console\.cloud\.google\.com/.* https://portal\.azure\.com/.* https://azuremarketplace\.microsoft\.com/.* https://one\.newrelic\.com/.* From 1fc71f4ae79d81b04294345940d1783d6455b230 Mon Sep 17 00:00:00 2001 From: qiancai Date: Fri, 15 May 2026 15:12:03 +0800 Subject: [PATCH 2/2] resolve link workflow cherry-pick conflicts --- .github/workflows/link-fail-fast.yaml | 11 +---------- .github/workflows/link.yaml | 
14 +++++++++++--- .lycheeignore | 5 ----- 3 files changed, 12 insertions(+), 18 deletions(-) diff --git a/.github/workflows/link-fail-fast.yaml b/.github/workflows/link-fail-fast.yaml index 3d82a701f0204..9e881c3d45a39 100644 --- a/.github/workflows/link-fail-fast.yaml +++ b/.github/workflows/link-fail-fast.yaml @@ -13,7 +13,7 @@ jobs: linkChecker: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 with: fetch-depth: 2 @@ -23,14 +23,6 @@ jobs: git -c core.quotePath=false diff --unified=0 --diff-filter=AM --no-ext-diff --no-color HEAD^1 HEAD -- '*.md' | perl .github/scripts/extract-changed-markdown-lines.pl .lychee-pr-changed-lines .lychee-pr-inputs.txt -<<<<<<< HEAD - - name: Link Checker - if: ${{ steps.changed-files.outputs.all_changed_files }} - uses: lycheeverse/lychee-action@v2.3.0 - with: - fail: true - args: --root-dir $(pwd) -E -i -n -t 45 -- '${{ steps.changed-files.outputs.all_changed_files }}' -======= count=$(wc -l < .lychee-pr-inputs.txt | tr -d ' ') echo "count=${count}" >> "$GITHUB_OUTPUT" @@ -60,6 +52,5 @@ jobs: fail: true failIfEmpty: false args: --root-dir $(pwd) --exclude '^file://' -E -i -n -t 45 --files-from .lychee-pr-inputs.txt ->>>>>>> 145d861113 (workflow: optimize external link checks (#22894)) env: GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}} diff --git a/.github/workflows/link.yaml b/.github/workflows/link.yaml index 208d9c192a570..bfe58bf19c37c 100644 --- a/.github/workflows/link.yaml +++ b/.github/workflows/link.yaml @@ -15,16 +15,24 @@ permissions: jobs: linkChecker: + if: github.repository == 'pingcap/docs' runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 - name: Download Exclude Path run: | curl -fsSL https://raw.githubusercontent.com/pingcap/docs/master/.lycheeignore --output .lycheeignore + - name: Restore lychee cache + uses: actions/cache@v4 + with: + path: .lycheecache + key: cache-lychee-${{ github.sha }} + restore-keys: cache-lychee- + - name: Check 
Links - uses: lycheeverse/lychee-action@v1.6.1 + uses: lycheeverse/lychee-action@v2 with: # Don't fail as we want the workflow to continue and run 'Create Issue From File' fail: false @@ -85,7 +93,7 @@ jobs: } > out.md - name: Create Issue From File - uses: peter-evans/create-issue-from-file@v4 + uses: peter-evans/create-issue-from-file@v6 with: title: Broken Link Detected content-filepath: out.md diff --git a/.lycheeignore b/.lycheeignore index 40bd6fa340d2a..dd89727f9573e 100644 --- a/.lycheeignore +++ b/.lycheeignore @@ -41,11 +41,6 @@ https://centminmod\.com/mydumper\.html https://docs\.pingcap\.com/tidb/v6\.6/system-variables#tidb_pessimistic_txn_aggressive_locking-new-in-v660 https://docs\.pingcap\.com/tidb/v7\.6/system-variables#tidb_ddl_version-new-in-v760 https://developers\.redhat\.com/blog/2021/01/05/building-red-hat-enterprise-linux-9-for-the-x86-64-v2-microarchitecture-level -<<<<<<< HEAD -https://portal\.azure\.com/.* -https://.*github.*/%7B%7B%7B%20.tidb_operator_version%20%7D%7D%7D -======= ->>>>>>> 145d861113 (workflow: optimize external link checks (#22894)) https://.*github.*/%7B%7B%7B.tidb-operator-version%7D%7D%7D https://console\.cloud\.google\.com/.* https://portal\.azure\.com/.*