freeleaps-ops/apps/pg-brain-split-recover/repmgr-split-brain-recovery.sh

#!/bin/bash
# filepath: repmgr-split-brain-recovery.sh

# Abort on the first unhandled command failure; every later step assumes the
# previous one succeeded.
set -e

# --- Cluster coordinates ----------------------------------------------------
NAMESPACE="freeleaps-prod"
STATEFULSET="freeleaps-prod-gitea-postgresql-ha-postgresql"
HEADLESS_SVC="${STATEFULSET}-headless.${NAMESPACE}.svc.freeleaps.cluster"

# --- Database credentials ---------------------------------------------------
# NOTE(review): the fallback passwords below are committed in clear text.
# Both can be supplied through the environment instead (REPMGR_PASSWORD /
# POSTGRES_PASSWORD); the literals are kept only as defaults so existing
# invocations keep working. Rotate them and drop the defaults when possible.
REPMGR_USER="repmgr"
REPMGR_PASSWORD="${REPMGR_PASSWORD:-WGZ47gbUTLvo}"
POSTGRES_PASSWORD="${POSTGRES_PASSWORD:-X9H2*9M2ZWYmuZ}"
REPMGR_DB="repmgr"
POSTGRES_USER="postgres"

# --- Backup locations -------------------------------------------------------
# One timestamp is taken once and shared, so the in-pod directory and the
# local directory of the same run always carry the same suffix.
RUN_TIMESTAMP="$(date +%Y%m%d_%H%M%S)"
BACKUP_DIR="/tmp/pg_backup_${RUN_TIMESTAMP}"
LOCAL_BACKUP_DIR="./pg_backups_${RUN_TIMESTAMP}"

echo "===== PostgreSQL Repmgr Split-Brain Recovery ====="
echo "This script will attempt to fix the repmgr split-brain issue"
echo ""

# Create local backup directory
mkdir -p "$LOCAL_BACKUP_DIR"

# Run a shell command line inside a pod via `kubectl exec ... bash -c`.
# Globals:   NAMESPACE (read)
# Arguments: $1 - pod name
#            $2 - command line, handed to bash -c inside the pod as ONE string
# Outputs:   whatever the remote command writes to stdout/stderr
# Returns:   the exit status of kubectl exec
run_in_pod() {
  local pod=$1
  local cmd=$2
  # Quoted so the namespace and pod name always arrive as single arguments.
  kubectl exec -n "$NAMESPACE" "$pod" -- bash -c "$cmd"
}

# Print the current WAL write position (LSN) reported by a pod.
# Globals:   REPMGR_PASSWORD, REPMGR_USER, REPMGR_DB (read)
# Arguments: $1 - pod name
# Outputs:   the LSN on stdout, e.g. 0/3000060
# Returns:   the exit status of the remote psql call
# Note: per the PostgreSQL documentation pg_current_wal_lsn() cannot be
# executed during recovery, so this is expected to fail on a node that is
# running as a standby.
get_wal_position() {
  local pod=$1
  # -A (unaligned) together with -t (tuples only) makes psql print the bare
  # value with no column padding, so callers can compare or embed it as is.
  run_in_pod "$pod" "PGPASSWORD=$REPMGR_PASSWORD psql -U $REPMGR_USER -d $REPMGR_DB -At -c \"SELECT pg_current_wal_lsn();\""
}

# Check whether a pod's PostgreSQL instance is running as a primary.
# Globals:   REPMGR_PASSWORD, REPMGR_USER, REPMGR_DB (read)
# Arguments: $1 - pod name
# Returns:   0 - pg_is_in_recovery() answered "f" (primary)
#            1 - it answered anything else (standby)
#            2 - the query could not be run at all (pod or database unreachable)
# Callers that only test the status with `if` still treat 2 as "not primary".
is_primary() {
  local pod=$1
  local result

  # Declared separately from the assignment: `local var=$(cmd)` would hide the
  # exit status of the query behind the always-successful `local`.
  result=$(run_in_pod "$pod" "PGPASSWORD=$REPMGR_PASSWORD psql -U $REPMGR_USER -d $REPMGR_DB -t -c \"SELECT pg_is_in_recovery();\"") || return 2

  # psql -t pads the value with whitespace; strip it and require an exact
  # match so unrelated output that merely contains an "f" is not mistaken
  # for a primary.
  result=${result//[[:space:]]/}
  [[ "$result" == "f" ]]
}

# Back up one pod and pull the result to the local machine.
# The archive contains a custom-format dump of every database except
# template0/template1/postgres, the global objects (roles, tablespaces) and,
# when present, postgresql.conf, pg_hba.conf and repmgr.conf.
# Globals:   BACKUP_DIR, LOCAL_BACKUP_DIR, NAMESPACE, POSTGRES_USER,
#            POSTGRES_PASSWORD (read)
# Arguments: $1 - pod name
# Outputs:   progress messages on stdout
# Returns:   non-zero as soon as a required step fails
backup_databases() {
  local pod=$1
  local backup_path="$BACKUP_DIR/$pod"
  local databases db quoted_db

  echo "Creating backup directory in the pod..."
  run_in_pod "$pod" "mkdir -p $backup_path" || return

  echo "Getting list of databases..."
  # Assigned separately from `local` so a failed listing is not silently
  # turned into "no databases to back up". -At prints one bare name per line,
  # which keeps names containing spaces intact.
  databases=$(run_in_pod "$pod" "PGPASSWORD=$POSTGRES_PASSWORD psql -U $POSTGRES_USER -At -c \"SELECT datname FROM pg_database WHERE datname NOT IN ('template0', 'template1', 'postgres')\"") || return

  echo "Backing up databases: $databases"
  # The list is read on fd 3 so nothing inside the loop can consume it
  # through stdin.
  while IFS= read -r -u 3 db; do
    [[ -n "$db" ]] || continue
    echo "Backing up database: $db"
    # Shell-quote the name because it is spliced into a remote command line.
    printf -v quoted_db '%q' "$db"
    run_in_pod "$pod" "PGPASSWORD=$POSTGRES_PASSWORD pg_dump -U $POSTGRES_USER -Fc $quoted_db > $backup_path/${quoted_db}.dump" || return
  done 3<<< "$databases"

  # Also backup global objects (roles, tablespaces)
  echo "Backing up global objects..."
  run_in_pod "$pod" "PGPASSWORD=$POSTGRES_PASSWORD pg_dumpall -U $POSTGRES_USER --globals-only > $backup_path/globals.sql" || return

  # Configuration files are best effort: a missing file must not abort the
  # backup, hence the remote `|| true`.
  echo "Backing up PostgreSQL configuration..."
  run_in_pod "$pod" "cp /bitnami/postgresql/conf/postgresql.conf $backup_path/ 2>/dev/null || true" || return
  run_in_pod "$pod" "cp /bitnami/postgresql/conf/pg_hba.conf $backup_path/ 2>/dev/null || true" || return

  echo "Backing up repmgr configuration..."
  run_in_pod "$pod" "cp /etc/repmgr.conf $backup_path/ 2>/dev/null || true" || return

  # Archive relative to the parent directory so the tarball unpacks into a
  # single directory named after the pod.
  echo "Creating archive of the backup..."
  run_in_pod "$pod" "tar -czf ${backup_path}.tar.gz -C ${backup_path%/*} ${backup_path##*/}" || return

  echo "Copying backup to local machine..."
  kubectl cp "$NAMESPACE/$pod:${backup_path}.tar.gz" "$LOCAL_BACKUP_DIR/${pod}_backup.tar.gz" || return

  echo "Cleaning up backup files in the pod..."
  run_in_pod "$pod" "rm -rf $backup_path ${backup_path}.tar.gz"
}

echo "Step 0: Checking current status of the cluster..."
# Survey every node once: print the role PostgreSQL reports and remember the
# WAL position of each node that answers, as WAL_POS_<index>.
for i in 0 1 2; do
  POD="${STATEFULSET}-${i}"
  printf 'Node %s (%s): ' "$i" "$POD"

  # Role according to is_primary
  if ! is_primary "$POD"; then
    echo "running as standby"
  else
    PRIMARY_STATE="running as primary"
    echo "$PRIMARY_STATE"
  fi

  # "N/A" stands in for a node whose WAL query failed
  WAL_POS=$(get_wal_position "$POD" 2>/dev/null || echo "N/A")
  if [ "$WAL_POS" = "N/A" ]; then
    continue
  fi

  echo "  - WAL position: $WAL_POS"
  # Keep the value under a per-node variable name for the later comparison
  printf -v "WAL_POS_${i}" '%s' "$WAL_POS"
done

printf '\n'
printf '%s\n' "Step 1: Backing up all databases from each node..."
# Each of the three nodes is backed up in turn, before the cluster is changed.
for i in 0 1 2; do
  POD="${STATEFULSET}-${i}"
  printf '%s\n' "Backing up data from node $i ($POD)..."
  backup_databases "$POD"
done

printf '%s\n' "All backups completed and stored in: $LOCAL_BACKUP_DIR"
printf '\n'

echo "Determining most advanced node based on WAL position..."

# For every node record two facts as NODE_<index>_* variables:
#   IS_PRIMARY         - its own row in repmgr.nodes mentions "primary"
#   RUNNING_AS_PRIMARY - is_primary reports it as a primary
# In a split-brain more than one node may end up with both set to true.
for i in 0 1 2; do
  POD="${STATEFULSET}-${i}"
  # The node's own row from the repmgr metadata; empty when the query fails
  NODE_INFO=$(run_in_pod "$POD" "PGPASSWORD=$REPMGR_PASSWORD psql -U $REPMGR_USER -d $REPMGR_DB -t -c \"SELECT node_id, node_name, type, active FROM repmgr.nodes WHERE node_name = '$POD';\"" 2>/dev/null || echo "")

  # Nothing came back: assume the node is not a primary in either sense
  if [ -z "$NODE_INFO" ]; then
    echo "Could not get info for node ${i}"
    printf -v "NODE_${i}_IS_PRIMARY" '%s' "false"
    printf -v "NODE_${i}_RUNNING_AS_PRIMARY" '%s' "false"
    continue
  fi

  echo "Node ${i} info: $NODE_INFO"

  # What the repmgr metadata says about the node
  case "$NODE_INFO" in
    *"primary"*)
      echo "Node ${i} is configured as a primary"
      printf -v "NODE_${i}_IS_PRIMARY" '%s' "true"
      ;;
    *)
      printf -v "NODE_${i}_IS_PRIMARY" '%s' "false"
      ;;
  esac

  # What is_primary reports for the node
  if ! is_primary "$POD"; then
    printf -v "NODE_${i}_RUNNING_AS_PRIMARY" '%s' "false"
  else
    echo "Node ${i} is running as primary (pg_is_in_recovery=false)"
    printf -v "NODE_${i}_RUNNING_AS_PRIMARY" '%s' "true"
  fi
done

echo ""
echo "Analyzing WAL positions to determine the most advanced node..."

# Return 0 when LSN $1 is strictly ahead of LSN $2, 1 otherwise.
# An LSN is printed as two hexadecimal numbers, "HI/LO". Comparing the
# printed form as text is wrong: '0/9000000' sorts after '0/10000000' although
# it is the smaller position. The halves are therefore compared numerically.
# Arguments: $1, $2 - LSNs in HI/LO form (surrounding whitespace is ignored)
lsn_is_ahead() {
  local left=${1//[[:space:]]/}
  local right=${2//[[:space:]]/}
  local left_hi=$((16#${left%%/*})) left_lo=$((16#${left##*/}))
  local right_hi=$((16#${right%%/*})) right_lo=$((16#${right##*/}))

  (( left_hi > right_hi || (left_hi == right_hi && left_lo > right_lo) ))
}

# Print the index (0, 1 or 2) of the node with the most advanced WAL position.
# Globals:   WAL_POS_0, WAL_POS_1, WAL_POS_2 (read; unset, empty or malformed
#            values are ignored)
# Outputs:   the chosen node index on stdout
# Returns:   1 when no node has a usable WAL position
# Ties go to the higher node index, which is what the previous chain of
# strict ">" comparisons produced as well.
pick_most_advanced_node() {
  local lsn_pattern='^[0-9A-Fa-f]+/[0-9A-Fa-f]+$'
  local best_node="" best_lsn="" node var_name lsn

  for node in 0 1 2; do
    var_name="WAL_POS_${node}"
    lsn=${!var_name:-}
    lsn=${lsn//[[:space:]]/}
    [[ $lsn =~ $lsn_pattern ]] || continue

    if [[ -z "$best_node" ]] || ! lsn_is_ahead "$best_lsn" "$lsn"; then
      best_node=$node
      best_lsn=$lsn
    fi
  done

  [[ -n "$best_node" ]] || return 1
  printf '%s\n' "$best_node"
}

# Compare WAL positions
if ! NEW_PRIMARY=$(pick_most_advanced_node); then
  echo "Could not determine most advanced node. Using node 0 as default primary."
  NEW_PRIMARY=0
fi

echo "Selected node ${NEW_PRIMARY} as the new primary based on WAL position."
# Indirect expansion looks up WAL_POS_<index> without resorting to eval
SELECTED_WAL_VAR="WAL_POS_${NEW_PRIMARY}"
WAL_POS_VALUE=${!SELECTED_WAL_VAR:-}
if [ -n "$WAL_POS_VALUE" ]; then
  echo "WAL position: $WAL_POS_VALUE"
fi
echo ""

# Confirm with user
# Only an explicit "y" continues past this point. -r keeps backslashes in the
# answer literal, and a read that fails (for example end of input when no
# terminal is attached) is handled as a refusal so the cancellation message is
# still printed instead of the script dying silently under `set -e`.
CONFIRM=""
read -r -p "Backups completed. Do you want to proceed with fixing the split-brain issue? (y/n): " CONFIRM || CONFIRM=""
if [[ "$CONFIRM" != "y" ]]; then
  echo "Operation cancelled. Backups are still available at $LOCAL_BACKUP_DIR"
  exit 1
fi

# Write a small wrapper script into a pod and run it to force-register that
# pod as the repmgr primary.
# Globals:   REPMGR_USER, REPMGR_PASSWORD, REPMGR_DB (read)
# Arguments: $1 - pod name
# Returns:   non-zero when writing or running the wrapper fails
register_primary_node() {
  local pod=$1

  # The remote here-document delimiter is quoted ('EOF'). With an unquoted
  # delimiter the shell inside the pod expands $PATH, $(find ...) and
  # $repmgr_bin while WRITING the file, which leaves `[ -z "" ]` in the
  # generated script and makes it exit 1 every time. The credentials are
  # still filled in, because that substitution happens locally, before the
  # command string is sent.
  run_in_pod "$pod" "cat > /tmp/register_primary.sh << 'EOF'
#!/bin/bash
export PGUSER='$REPMGR_USER'
export PGPASSWORD='$REPMGR_PASSWORD'
export PGDATABASE='$REPMGR_DB'
export PATH=\$PATH:/opt/bitnami/repmgr/bin:/opt/bitnami/postgresql/bin

# Try to find repmgr
repmgr_bin=\$(find /opt/bitnami -name repmgr -type f | head -1)
if [ -z \"\$repmgr_bin\" ]; then
    echo \"Could not find repmgr binary\"
    exit 1
fi

\"\$repmgr_bin\" -f /etc/repmgr.conf primary register --force
EOF
chmod +x /tmp/register_primary.sh" || return

  # Run the script directly
  run_in_pod "$pod" "bash /tmp/register_primary.sh"
}

echo ""
echo "Step 2: Registering node ${NEW_PRIMARY} as primary..."
PRIMARY_POD="${STATEFULSET}-${NEW_PRIMARY}"
register_primary_node "$PRIMARY_POD"

# Rebuild one node as a standby of the new primary: stop PostgreSQL, replace
# its data directory with a fresh clone, start it and register it with repmgr.
# Globals:   REPMGR_USER, REPMGR_PASSWORD, REPMGR_DB, HEADLESS_SVC (read)
# Arguments: $1 - pod to rebuild
#            $2 - node index, used in the progress messages only
#            $3 - pod name of the new primary to clone from
# Returns:   non-zero as soon as one step fails; later steps are not attempted
rebuild_standby_node() {
  local standby_pod=$1
  local node_index=$2
  local primary_pod=$3

  echo "Step 3: Stopping PostgreSQL on standby node ${node_index}..."
  run_in_pod "$standby_pod" "/opt/bitnami/scripts/postgresql-repmgr/stop.sh" || return

  echo "Step 4: Cloning primary data to standby node ${node_index}..."

  # The remote here-document delimiter is quoted ('EOF') so the shell in the
  # pod writes the body verbatim; unquoted, it would expand $(find ...) and
  # $repmgr_bin while writing the file and the generated script would always
  # exit 1. Credentials and the primary's host name are substituted locally.
  # The data directory is wiped only AFTER the repmgr binary has been found,
  # so a missing binary no longer costs the node its data.
  run_in_pod "$standby_pod" "cat > /tmp/clone_standby.sh << 'EOF'
#!/bin/bash
export PGUSER='$REPMGR_USER'
export PGPASSWORD='$REPMGR_PASSWORD'
export PGDATABASE='$REPMGR_DB'
export PATH=\$PATH:/opt/bitnami/repmgr/bin:/opt/bitnami/postgresql/bin

# Try to find repmgr
repmgr_bin=\$(find /opt/bitnami -name repmgr -type f | head -1)
if [ -z \"\$repmgr_bin\" ]; then
    echo \"Could not find repmgr binary\"
    exit 1
fi

# Remove existing data
rm -rf /bitnami/postgresql/data/*

\"\$repmgr_bin\" -h ${primary_pod}.${HEADLESS_SVC} -p 5432 standby clone --force
EOF
chmod +x /tmp/clone_standby.sh" || return

  # Run the clone script directly
  run_in_pod "$standby_pod" "bash /tmp/clone_standby.sh" || return

  echo "Step 5: Starting PostgreSQL on standby node ${node_index}..."
  run_in_pod "$standby_pod" "/opt/bitnami/scripts/postgresql-repmgr/start.sh" || return

  echo "Step 6: Registering node ${node_index} as standby..."

  # Same quoted-delimiter rule as for the clone script above.
  run_in_pod "$standby_pod" "cat > /tmp/register_standby.sh << 'EOF'
#!/bin/bash
export PGUSER='$REPMGR_USER'
export PGPASSWORD='$REPMGR_PASSWORD'
export PGDATABASE='$REPMGR_DB'
export PATH=\$PATH:/opt/bitnami/repmgr/bin:/opt/bitnami/postgresql/bin

# Try to find repmgr
repmgr_bin=\$(find /opt/bitnami -name repmgr -type f | head -1)
if [ -z \"\$repmgr_bin\" ]; then
    echo \"Could not find repmgr binary\"
    exit 1
fi

\"\$repmgr_bin\" -f /etc/repmgr.conf standby register --force
EOF
chmod +x /tmp/register_standby.sh" || return

  # Run the register script directly
  run_in_pod "$standby_pod" "bash /tmp/register_standby.sh"
}

# Every node other than the new primary is rebuilt as its standby
for i in 0 1 2; do
  if [ "$i" -ne "$NEW_PRIMARY" ]; then
    STANDBY_POD="${STATEFULSET}-${i}"
    rebuild_standby_node "$STANDBY_POD" "$i" "$PRIMARY_POD"
  fi
done

# Write a small wrapper script into a pod and run it to print the output of
# `repmgr cluster show`.
# Globals:   REPMGR_USER, REPMGR_PASSWORD, REPMGR_DB (read)
# Arguments: $1 - pod name
# Outputs:   the cluster overview on stdout
# Returns:   non-zero when writing or running the wrapper fails
show_cluster_status() {
  local pod=$1

  # The remote here-document delimiter is quoted ('EOF') so the shell in the
  # pod writes the body verbatim; unquoted, it would expand $(find ...) and
  # $repmgr_bin while writing the file and the generated script would always
  # exit 1. The credentials are substituted locally, before sending.
  run_in_pod "$pod" "cat > /tmp/cluster_status.sh << 'EOF'
#!/bin/bash
export PGUSER='$REPMGR_USER'
export PGPASSWORD='$REPMGR_PASSWORD'
export PGDATABASE='$REPMGR_DB'
export PATH=\$PATH:/opt/bitnami/repmgr/bin:/opt/bitnami/postgresql/bin

# Try to find repmgr
repmgr_bin=\$(find /opt/bitnami -name repmgr -type f | head -1)
if [ -z \"\$repmgr_bin\" ]; then
    echo \"Could not find repmgr binary\"
    exit 1
fi

\"\$repmgr_bin\" -f /etc/repmgr.conf cluster show
EOF
chmod +x /tmp/cluster_status.sh" || return

  # Run the cluster status script directly
  run_in_pod "$pod" "bash /tmp/cluster_status.sh"
}

echo ""
echo "Step 7: Checking final cluster status..."
FINAL_STATUS=$(show_cluster_status "$PRIMARY_POD")
echo "$FINAL_STATUS"

# Clean up temporary scripts
# Best effort on every node: the wrapper scripts embed the repmgr password,
# and a node that cannot be reached must not fail the run at this point.
for i in 0 1 2; do
  POD="${STATEFULSET}-${i}"
  run_in_pod "$POD" "rm -f /tmp/register_primary.sh /tmp/clone_standby.sh /tmp/register_standby.sh /tmp/cluster_status.sh" || true
done

echo ""
echo "Split-brain recovery completed."
echo "Your database backups are available at: $LOCAL_BACKUP_DIR"
echo "Please verify that the cluster is now in a consistent state."
Add Dockerfile and repmgr script for PostgreSQL replication management - Created a new Dockerfile for PostgreSQL using the Bitnami repmgr image. - Added a comprehensive `librepmgr.sh` script to manage PostgreSQL replication, including functions for node identification, password management, configuration validation, and primary/standby node operations. - Implemented environment variable validations and PostgreSQL configuration injections for replication settings. - Included functions for creating the repmgr user and database, as well as handling node registration and failover processes. Signed-off-by: zhenyus <zhenyus@mathmast.com> 2025-04-09 08:15:23 +00:00			`#!/bin/bash`
			`# filepath: repmgr-split-brain-recovery.sh`

			`set -e`

			`NAMESPACE="freeleaps-prod"`
			`STATEFULSET="freeleaps-prod-gitea-postgresql-ha-postgresql"`
			`HEADLESS_SVC="${STATEFULSET}-headless.${NAMESPACE}.svc.freeleaps.cluster"`
			`REPMGR_USER="repmgr"`
			`REPMGR_PASSWORD="WGZ47gbUTLvo"`
			`POSTGRES_PASSWORD="X9H2*9M2ZWYmuZ"`
			`REPMGR_DB="repmgr"`
			`POSTGRES_USER="postgres"`
			`BACKUP_DIR="/tmp/pg_backup_$(date +%Y%m%d_%H%M%S)"`
			`LOCAL_BACKUP_DIR="./pg_backups_$(date +%Y%m%d_%H%M%S)"`

			`echo "===== PostgreSQL Repmgr Split-Brain Recovery ====="`
			`echo "This script will attempt to fix the repmgr split-brain issue"`
			`echo ""`

			`# Create local backup directory`
			`mkdir -p $LOCAL_BACKUP_DIR`

			`# Function to run commands in a pod`
			`run_in_pod() {`
			`local pod=$1`
			`local cmd=$2`
			`kubectl exec -n $NAMESPACE $pod -- bash -c "$cmd"`
			`}`

			`# Function to get PostgreSQL WAL position`
			`get_wal_position() {`
			`local pod=$1`
			`run_in_pod $pod "PGPASSWORD=$REPMGR_PASSWORD psql -U $REPMGR_USER -d $REPMGR_DB -t -c \"SELECT pg_current_wal_lsn();\""`
			`}`

			`# Function to check if node is primary`
			`is_primary() {`
			`local pod=$1`
			`local result=$(run_in_pod $pod "PGPASSWORD=$REPMGR_PASSWORD psql -U $REPMGR_USER -d $REPMGR_DB -t -c \"SELECT pg_is_in_recovery();\"")`
			`if [[ $result == "f" ]]; then`
			`return 0 # is primary`
			`else`
			`return 1 # is standby`
			`fi`
			`}`

			`# Function to backup databases from a pod`
			`backup_databases() {`
			`local pod=$1`
			`local backup_path="$BACKUP_DIR/$pod"`

			`echo "Creating backup directory in the pod..."`
			`run_in_pod $pod "mkdir -p $backup_path"`

			`echo "Getting list of databases..."`
			`local databases=$(run_in_pod $pod "PGPASSWORD=$POSTGRES_PASSWORD psql -U $POSTGRES_USER -t -c \"SELECT datname FROM pg_database WHERE datname NOT IN ('template0', 'template1', 'postgres')\" \| tr -d ' '")`

			`echo "Backing up databases: $databases"`
			`for db in $databases; do`
			`echo "Backing up database: $db"`
			`run_in_pod $pod "PGPASSWORD=$POSTGRES_PASSWORD pg_dump -U $POSTGRES_USER -Fc $db > $backup_path/${db}.dump"`
			`done`

			`# Also backup global objects (roles, tablespaces)`
			`echo "Backing up global objects..."`
			`run_in_pod $pod "PGPASSWORD=$POSTGRES_PASSWORD pg_dumpall -U $POSTGRES_USER --globals-only > $backup_path/globals.sql"`

			`# Backup PostgreSQL configuration`
			`echo "Backing up PostgreSQL configuration..."`
			`run_in_pod $pod "cp /bitnami/postgresql/conf/postgresql.conf $backup_path/ 2>/dev/null \|\| true"`
			`run_in_pod $pod "cp /bitnami/postgresql/conf/pg_hba.conf $backup_path/ 2>/dev/null \|\| true"`

			`# Copy repmgr configuration`
			`echo "Backing up repmgr configuration..."`
			`run_in_pod $pod "cp /etc/repmgr.conf $backup_path/ 2>/dev/null \|\| true"`

			`# Tar the backup files`
			`echo "Creating archive of the backup..."`
			`run_in_pod $pod "tar -czf ${backup_path}.tar.gz -C $(dirname $backup_path) $(basename $backup_path)"`

			`# Copy backup to local machine`
			`echo "Copying backup to local machine..."`
			`kubectl cp $NAMESPACE/$pod:${backup_path}.tar.gz $LOCAL_BACKUP_DIR/${pod}_backup.tar.gz`

			`# Cleanup backup in the pod`
			`echo "Cleaning up backup files in the pod..."`
			`run_in_pod $pod "rm -rf $backup_path ${backup_path}.tar.gz"`
			`}`

			`echo "Step 0: Checking current status of the cluster..."`
			`for i in 0 1 2; do`
			`POD="${STATEFULSET}-${i}"`
			`echo -n "Node ${i} ($POD): "`

			`# Check if node is running as primary`
			`if is_primary $POD; then`
			`PRIMARY_STATE="running as primary"`
			`echo "$PRIMARY_STATE"`
			`else`
			`echo "running as standby"`
			`fi`

			`# Get WAL position`
			`WAL_POS=$(get_wal_position $POD 2>/dev/null \|\| echo "N/A")`
			`if [ "$WAL_POS" != "N/A" ]; then`
			`echo " - WAL position: $WAL_POS"`
			`# Store WAL positions for comparison`
			`declare "WAL_POS_${i}=$WAL_POS"`
			`fi`
			`done`

			`echo ""`
			`echo "Step 1: Backing up all databases from each node..."`
			`for i in 0 1 2; do`
			`POD="${STATEFULSET}-${i}"`
			`echo "Backing up data from node $i ($POD)..."`
			`backup_databases $POD`
			`done`

			`echo "All backups completed and stored in: $LOCAL_BACKUP_DIR"`
			`echo ""`

			`echo "Determining most advanced node based on WAL position..."`

			`# Get the primary nodes from each pod - there might be more than one in split-brain`
			`for i in 0 1 2; do`
			`POD="${STATEFULSET}-${i}"`
			`# Get node information`
			`NODE_INFO=$(run_in_pod $POD "PGPASSWORD=$REPMGR_PASSWORD psql -U $REPMGR_USER -d $REPMGR_DB -t -c \"SELECT node_id, node_name, type, active FROM repmgr.nodes WHERE node_name = '$POD';\"" 2>/dev/null \|\| echo "")`

			`if [ -n "$NODE_INFO" ]; then`
			`echo "Node ${i} info: $NODE_INFO"`

			`# Store if this node thinks it's a primary`
			`if [[ $NODE_INFO == "primary" ]]; then`
			`echo "Node ${i} is configured as a primary"`
			`declare "NODE_${i}_IS_PRIMARY=true"`
			`else`
			`declare "NODE_${i}_IS_PRIMARY=false"`
			`fi`

			`# Check if node is actually running as primary using pg_is_in_recovery()`
			`if is_primary $POD; then`
			`echo "Node ${i} is running as primary (pg_is_in_recovery=false)"`
			`declare "NODE_${i}_RUNNING_AS_PRIMARY=true"`
			`else`
			`declare "NODE_${i}_RUNNING_AS_PRIMARY=false"`
			`fi`
			`else`
			`echo "Could not get info for node ${i}"`
			`declare "NODE_${i}_IS_PRIMARY=false"`
			`declare "NODE_${i}_RUNNING_AS_PRIMARY=false"`
			`fi`
			`done`

			`echo ""`
			`echo "Analyzing WAL positions to determine the most advanced node..."`

			`# Compare WAL positions`
			`if [ -n "${WAL_POS_0}" ] && [ -n "${WAL_POS_1}" ] && [ -n "${WAL_POS_2}" ]; then`
			`# We have all WAL positions, find the most advanced`
			`if run_in_pod ${STATEFULSET}-0 "PGPASSWORD=$REPMGR_PASSWORD psql -U $REPMGR_USER -d $REPMGR_DB -t -c \"SELECT '${WAL_POS_0}' > '${WAL_POS_1}' AND '${WAL_POS_0}' > '${WAL_POS_2}';\"" \| grep -q 't'; then`
			`NEW_PRIMARY=0`
			`elif run_in_pod ${STATEFULSET}-0 "PGPASSWORD=$REPMGR_PASSWORD psql -U $REPMGR_USER -d $REPMGR_DB -t -c \"SELECT '${WAL_POS_1}' > '${WAL_POS_2}';\"" \| grep -q 't'; then`
			`NEW_PRIMARY=1`
			`else`
			`NEW_PRIMARY=2`
			`fi`
			`elif [ -n "${WAL_POS_0}" ] && [ -n "${WAL_POS_1}" ]; then`
			`# Only nodes 0 and 1 have WAL positions`
			`if run_in_pod ${STATEFULSET}-0 "PGPASSWORD=$REPMGR_PASSWORD psql -U $REPMGR_USER -d $REPMGR_DB -t -c \"SELECT '${WAL_POS_0}' > '${WAL_POS_1}';\"" \| grep -q 't'; then`
			`NEW_PRIMARY=0`
			`else`
			`NEW_PRIMARY=1`
			`fi`
			`elif [ -n "${WAL_POS_0}" ] && [ -n "${WAL_POS_2}" ]; then`
			`# Only nodes 0 and 2 have WAL positions`
			`if run_in_pod ${STATEFULSET}-0 "PGPASSWORD=$REPMGR_PASSWORD psql -U $REPMGR_USER -d $REPMGR_DB -t -c \"SELECT '${WAL_POS_0}' > '${WAL_POS_2}';\"" \| grep -q 't'; then`
			`NEW_PRIMARY=0`
			`else`
			`NEW_PRIMARY=2`
			`fi`
			`elif [ -n "${WAL_POS_1}" ] && [ -n "${WAL_POS_2}" ]; then`
			`# Only nodes 1 and 2 have WAL positions`
			`if run_in_pod ${STATEFULSET}-1 "PGPASSWORD=$REPMGR_PASSWORD psql -U $REPMGR_USER -d $REPMGR_DB -t -c \"SELECT '${WAL_POS_1}' > '${WAL_POS_2}';\"" \| grep -q 't'; then`
			`NEW_PRIMARY=1`
			`else`
			`NEW_PRIMARY=2`
			`fi`
			`elif [ -n "${WAL_POS_0}" ]; then`
			`NEW_PRIMARY=0`
			`elif [ -n "${WAL_POS_1}" ]; then`
			`NEW_PRIMARY=1`
			`elif [ -n "${WAL_POS_2}" ]; then`
			`NEW_PRIMARY=2`
			`else`
			`echo "Could not determine most advanced node. Using node 0 as default primary."`
			`NEW_PRIMARY=0`
			`fi`

			`echo "Selected node ${NEW_PRIMARY} as the new primary based on WAL position."`
			`# Fix the bad substitution by using proper indirection`
			`eval WAL_POS_VALUE=\$WAL_POS_${NEW_PRIMARY}`
			`if [ -n "$WAL_POS_VALUE" ]; then`
			`echo "WAL position: $WAL_POS_VALUE"`
			`fi`
			`echo ""`

			`# Confirm with user`
			`read -p "Backups completed. Do you want to proceed with fixing the split-brain issue? (y/n): " CONFIRM`
			`if [[ "$CONFIRM" != "y" ]]; then`
			`echo "Operation cancelled. Backups are still available at $LOCAL_BACKUP_DIR"`
			`exit 1`
			`fi`

			`echo ""`
			`echo "Step 2: Registering node ${NEW_PRIMARY} as primary..."`
			`PRIMARY_POD="${STATEFULSET}-${NEW_PRIMARY}"`

			`# Create a temporary script to run repmgr commands`
			`run_in_pod $PRIMARY_POD "cat > /tmp/register_primary.sh << EOF`
			`#!/bin/bash`
			`export PGUSER='$REPMGR_USER'`
			`export PGPASSWORD='$REPMGR_PASSWORD'`
			`export PGDATABASE='$REPMGR_DB'`
			`export PATH=\$PATH:/opt/bitnami/repmgr/bin:/opt/bitnami/postgresql/bin`

			`# Try to find repmgr`
			`repmgr_bin=\$(find /opt/bitnami -name repmgr -type f \| head -1)`
			`if [ -z \"\$repmgr_bin\" ]; then`
			`echo \"Could not find repmgr binary\"`
			`exit 1`
			`fi`

			`\$repmgr_bin -f /etc/repmgr.conf primary register --force`
			`EOF`
			`chmod +x /tmp/register_primary.sh"`

			`# Run the script directly`
			`run_in_pod $PRIMARY_POD "bash /tmp/register_primary.sh"`

			`# Stop PostgreSQL on other nodes`
			`for i in 0 1 2; do`
			`if [ $i -ne $NEW_PRIMARY ]; then`
			`STANDBY_POD="${STATEFULSET}-${i}"`
			`echo "Step 3: Stopping PostgreSQL on standby node ${i}..."`
			`run_in_pod $STANDBY_POD "/opt/bitnami/scripts/postgresql-repmgr/stop.sh"`

			`echo "Step 4: Cloning primary data to standby node ${i}..."`

			`# Create a temporary script for cloning the standby that doesn't rely on specific user`
			`run_in_pod $STANDBY_POD "cat > /tmp/clone_standby.sh << EOF`
			`#!/bin/bash`
			`export PGUSER='$REPMGR_USER'`
			`export PGPASSWORD='$REPMGR_PASSWORD'`
			`export PGDATABASE='$REPMGR_DB'`
			`export PATH=\$PATH:/opt/bitnami/repmgr/bin:/opt/bitnami/postgresql/bin`

			`# Remove existing data`
			`rm -rf /bitnami/postgresql/data/*`

			`# Try to find repmgr`
			`repmgr_bin=\$(find /opt/bitnami -name repmgr -type f \| head -1)`
			`if [ -z \"\$repmgr_bin\" ]; then`
			`echo \"Could not find repmgr binary\"`
			`exit 1`
			`fi`

			`\$repmgr_bin -h ${PRIMARY_POD}.${HEADLESS_SVC} -p 5432 standby clone --force`
			`EOF`
			`chmod +x /tmp/clone_standby.sh"`

			`# Run the clone script directly`
			`run_in_pod $STANDBY_POD "bash /tmp/clone_standby.sh"`

			`echo "Step 5: Starting PostgreSQL on standby node ${i}..."`
			`run_in_pod $STANDBY_POD "/opt/bitnami/scripts/postgresql-repmgr/start.sh"`

			`echo "Step 6: Registering node ${i} as standby..."`

			`# Create a temporary script for registering the standby`
			`run_in_pod $STANDBY_POD "cat > /tmp/register_standby.sh << EOF`
			`#!/bin/bash`
			`export PGUSER='$REPMGR_USER'`
			`export PGPASSWORD='$REPMGR_PASSWORD'`
			`export PGDATABASE='$REPMGR_DB'`
			`export PATH=\$PATH:/opt/bitnami/repmgr/bin:/opt/bitnami/postgresql/bin`

			`# Try to find repmgr`
			`repmgr_bin=\$(find /opt/bitnami -name repmgr -type f \| head -1)`
			`if [ -z \"\$repmgr_bin\" ]; then`
			`echo \"Could not find repmgr binary\"`
			`exit 1`
			`fi`

			`\$repmgr_bin -f /etc/repmgr.conf standby register --force`
			`EOF`
			`chmod +x /tmp/register_standby.sh"`

			`# Run the register script directly`
			`run_in_pod $STANDBY_POD "bash /tmp/register_standby.sh"`
			`fi`
			`done`

			`echo ""`
			`echo "Step 7: Checking final cluster status..."`

			`# Create a temporary script for checking cluster status`
			`run_in_pod $PRIMARY_POD "cat > /tmp/cluster_status.sh << EOF`
			`#!/bin/bash`
			`export PGUSER='$REPMGR_USER'`
			`export PGPASSWORD='$REPMGR_PASSWORD'`
			`export PGDATABASE='$REPMGR_DB'`
			`export PATH=\$PATH:/opt/bitnami/repmgr/bin:/opt/bitnami/postgresql/bin`

			`# Try to find repmgr`
			`repmgr_bin=\$(find /opt/bitnami -name repmgr -type f \| head -1)`
			`if [ -z \"\$repmgr_bin\" ]; then`
			`echo \"Could not find repmgr binary\"`
			`exit 1`
			`fi`

			`\$repmgr_bin -f /etc/repmgr.conf cluster show`
			`EOF`
			`chmod +x /tmp/cluster_status.sh"`

			`# Run the cluster status script directly`
			`FINAL_STATUS=$(run_in_pod $PRIMARY_POD "bash /tmp/cluster_status.sh")`
			`echo "$FINAL_STATUS"`

			`# Clean up temporary scripts`
			`for i in 0 1 2; do`
			`POD="${STATEFULSET}-${i}"`
			`run_in_pod $POD "rm -f /tmp/register_primary.sh /tmp/clone_standby.sh /tmp/register_standby.sh /tmp/cluster_status.sh" \|\| true`
			`done`

			`echo ""`
			`echo "Split-brain recovery completed."`
			`echo "Your database backups are available at: $LOCAL_BACKUP_DIR"`
			`echo "Please verify that the cluster is now in a consistent state."`