Test restartpoints in archive recovery.
author: Noah Misch <noah@leadboat.com>
Sun, 20 Apr 2025 15:28:48 +0000 (08:28 -0700)
committer: Noah Misch <noah@leadboat.com>
Sun, 20 Apr 2025 15:28:48 +0000 (08:28 -0700)
v14 commit 1f95181b44c843729caaa688f74babe9403b5850 and its v13
equivalent caused timing-dependent failures in archive recovery, at
restartpoints.  The symptom was "invalid magic number 0000 in log
segment X, offset 0", "unexpected pageaddr X in log segment Y, offset 0"
[X < Y], or an assertion failure.  Commit
3635a0a35aafd3bfa80b7a809bc6e91ccd36606a and predecessors back-patched
v15 changes to fix that.  This test reproduces the problem
probabilistically, typically in less than 1000 iterations of the test.
Hence, buildfarm and CI runs would have surfaced enough failures to get
attention within a day.

Reported-by: Arun Thirupathi <arunth@google.com>
Discussion: https://postgr.es/m/20250306193013.36.nmisch@google.com
Backpatch-through: 13

src/test/recovery/meson.build
src/test/recovery/t/045_archive_restartpoint.pl [new file with mode: 0644]

index 057bcde1434d3cf79b98b03bbaed74c255e66c4e..cb983766c679385747026c9d6f27163deecd6a6a 100644 (file)
@@ -53,6 +53,7 @@ tests += {
       't/042_low_level_backup.pl',
       't/043_no_contrecord_switch.pl',
       't/044_invalidate_inactive_slots.pl',
+      't/045_archive_restartpoint.pl',
     ],
   },
 }
diff --git a/src/test/recovery/t/045_archive_restartpoint.pl b/src/test/recovery/t/045_archive_restartpoint.pl
new file mode 100644 (file)
index 0000000..b143bc4
--- /dev/null
@@ -0,0 +1,57 @@
+
+# Copyright (c) 2024-2025, PostgreSQL Global Development Group
+
+# Test restartpoints in archive recovery: many WAL segments, small max_wal_size.
+use strict;
+use warnings;
+
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More;
+
+my $archive_max_mb = 320;
+my $wal_segsize = 1;
+
+# Initialize primary node with archiving and a custom WAL segment size, then back it up
+my $node_primary = PostgreSQL::Test::Cluster->new('primary');
+$node_primary->init(
+   has_archiving => 1,
+   allows_streaming => 1,
+   extra => [ '--wal-segsize' => $wal_segsize ]);
+$node_primary->start;
+my $backup_name = 'my_backup';
+$node_primary->backup($backup_name);
+# Run CHECKPOINT plus pg_switch_wal() $archive_max_mb / $wal_segsize times
+$node_primary->safe_psql('postgres',
+   ('DO $$BEGIN FOR i IN 1..' . $archive_max_mb / $wal_segsize)
+     . ' LOOP CHECKPOINT; PERFORM pg_switch_wal(); END LOOP; END$$;');
+
+# Record the recovery target LSN, then force archiving of the WAL file containing it
+my $until_lsn = $node_primary->lsn('write');
+$node_primary->safe_psql('postgres', "SELECT pg_switch_wal()");
+$node_primary->stop;
+
+# Archive recovery: restore from the backup, replay up to $until_lsn, then pause
+my $node_restore = PostgreSQL::Test::Cluster->new('restore');
+$node_restore->init_from_backup($node_primary, $backup_name,
+   has_restoring => 1);
+$node_restore->append_conf('postgresql.conf',
+   "recovery_target_lsn = '$until_lsn'");
+$node_restore->append_conf('postgresql.conf',
+   'recovery_target_action = pause');
+$node_restore->append_conf('postgresql.conf',
+   'max_wal_size = ' . 2 * $wal_segsize);
+$node_restore->append_conf('postgresql.conf', 'log_checkpoints = on');
+
+$node_restore->start;
+
+# Wait until the restore node has replayed up to the recovery target LSN
+my $caughtup_query =
+  "SELECT '$until_lsn'::pg_lsn <= pg_last_wal_replay_lsn()";
+$node_restore->poll_query_until('postgres', $caughtup_query)
+  or die "Timed out while waiting for restore to catch up";
+
+$node_restore->stop;
+ok(1, 'restore caught up');
+
+done_testing();