{
"formatVersion": 1,
"changeTime": "2023-07-26T12:41:50.048697008Z",
"cluster": {
"uid": "735e8856",
"generation": 1,
"changeTime": "2023-07-26T11:41:07.029419632Z",
"spec": {
"sleepInterval": "2s",
"requestTimeout": "4s",
"failInterval": "8s",
"synchronousReplication": true,
"minSynchronousStandbys": 1,
"maxSynchronousStandbys": 2,
"additionalWalSenders": null,
"additionalMasterReplicationSlots": null,
"usePgrewind": true,
"initMode": "new",
"pgParameters": {
"datestyle": "iso, mdy",
"default_text_search_config": "pg_catalog.english",
"dynamic_shared_memory_type": "posix",
"lc_messages": "en_US.UTF-8",
"lc_monetary": "en_US.UTF-8",
"lc_numeric": "en_US.UTF-8",
"lc_time": "en_US.UTF-8",
"log_min_duration_statement": "10",
"log_timezone": "UTC",
"logging_collector": "on",
"max_connections": "100",
"max_wal_size": "1GB",
"min_wal_size": "80MB",
"random_page_cost": "1.1",
"shared_buffers": "128MB",
"timezone": "UTC",
"wal_level": "replica",
"work_mem": "65536"
},
"pgHBA": null,
"automaticPgRestart": null
},
"status": {
"phase": "normal",
"master": "3397fb23"
}
},
"keepers": {
"node_4": {
"uid": "node_4",
"generation": 1,
"changeTime": "2023-07-26T12:41:50.048761009Z",
"spec": {},
"status": {
"healthy": true,
"lastHealthyTime": "2023-07-26T12:41:50.046055293Z",
"bootUUID": "e687f779-361c-4213-a5f4-157f352ed63a",
"postgresBinaryVersion": {
"Maj": 14,
"Min": 1
},
"canBeMaster": true,
"canBeSynchronousReplica": true
}
},
"node_5": {
"uid": "node_5",
"generation": 1,
"changeTime": "2023-07-26T12:41:50.048755779Z",
"spec": {},
"status": {
"healthy": true,
"lastHealthyTime": "2023-07-26T12:41:50.046055443Z",
"bootUUID": "9b604f18-c48a-42f4-a774-cd49795e0e89",
"postgresBinaryVersion": {
"Maj": 14,
"Min": 1
},
"canBeMaster": true,
"canBeSynchronousReplica": true
}
},
"node_6": {
"uid": "node_6",
"generation": 1,
"changeTime": "2023-07-24T14:10:46.921750869Z",
"spec": {},
"status": {
"lastHealthyTime": "2023-07-24T14:10:44.902643198Z",
"bootUUID": "2c4a6ad5-1ed8-4a55-aa25-0c30cb2a4acc",
"postgresBinaryVersion": {
"Maj": 14,
"Min": 1
},
"canBeMaster": true,
"canBeSynchronousReplica": true
}
}
},
"dbs": {
"32e06e90": {
"uid": "32e06e90",
"generation": 2,
"changeTime": "2023-07-26T11:41:11.057249694Z",
"spec": {
"keeperUID": "node_6",
"requestTimeout": "4s",
"maxStandbys": 20,
"usePgrewind": true,
"additionalWalSenders": 5,
"additionalReplicationSlots": null,
"initMode": "resync",
"pgParameters": {
"datestyle": "iso, mdy",
"default_text_search_config": "pg_catalog.english",
"dynamic_shared_memory_type": "posix",
"lc_messages": "en_US.UTF-8",
"lc_monetary": "en_US.UTF-8",
"lc_numeric": "en_US.UTF-8",
"lc_time": "en_US.UTF-8",
"log_min_duration_statement": "10",
"log_timezone": "UTC",
"logging_collector": "on",
"max_connections": "100",
"max_wal_size": "1GB",
"min_wal_size": "80MB",
"random_page_cost": "1.1",
"shared_buffers": "128MB",
"timezone": "UTC",
"wal_level": "replica",
"work_mem": "65536"
},
"pgHBA": null,
"role": "standby",
"followConfig": {
"type": "internal",
"dbuid": "3397fb23"
},
"followers": [],
"synchronousStandbys": null,
"externalSynchronousStandbys": null
},
"status": {
"synchronousStandbys": null
}
},
"3397fb23": {
"uid": "3397fb23",
"generation": 5,
"changeTime": "2023-07-26T12:41:50.048753905Z",
"spec": {
"keeperUID": "node_5",
"requestTimeout": "4s",
"maxStandbys": 20,
"synchronousReplication": true,
"usePgrewind": true,
"additionalWalSenders": 5,
"additionalReplicationSlots": null,
"initMode": "none",
"pgParameters": {
"datestyle": "iso, mdy",
"default_text_search_config": "pg_catalog.english",
"dynamic_shared_memory_type": "posix",
"lc_messages": "en_US.UTF-8",
"lc_monetary": "en_US.UTF-8",
"lc_numeric": "en_US.UTF-8",
"lc_time": "en_US.UTF-8",
"log_min_duration_statement": "10",
"log_timezone": "UTC",
"logging_collector": "on",
"max_connections": "100",
"max_wal_size": "1GB",
"min_wal_size": "80MB",
"random_page_cost": "1.1",
"shared_buffers": "128MB",
"timezone": "UTC",
"wal_level": "replica",
"work_mem": "65536"
},
"pgHBA": null,
"role": "master",
"followers": [
"32e06e90",
"c6cb2f35"
],
"synchronousStandbys": [
"c6cb2f35"
],
"externalSynchronousStandbys": []
},
"status": {
"healthy": true,
"currentGeneration": 5,
"listenAddress": "10.0.0.5",
"port": "5432",
"systemdID": "7191260299357525396",
"timelineID": 31,
"xLogPos": 411870652720,
"timelinesHistory": [
{
"timelineID": 29,
"switchPoint": 365271653264,
"reason": "no recovery target specified"
},
{
"timelineID": 30,
"switchPoint": 409000508648,
"reason": "no recovery target specified"
}
],
"pgParameters": {
"datestyle": "iso, mdy",
"default_text_search_config": "pg_catalog.english",
"dynamic_shared_memory_type": "posix",
"lc_messages": "en_US.UTF-8",
"lc_monetary": "en_US.UTF-8",
"lc_numeric": "en_US.UTF-8",
"lc_time": "en_US.UTF-8",
"log_min_duration_statement": "10",
"log_timezone": "UTC",
"logging_collector": "on",
"max_connections": "100",
"max_wal_size": "1GB",
"min_wal_size": "80MB",
"random_page_cost": "1.1",
"shared_buffers": "128MB",
"timezone": "UTC",
"wal_level": "replica",
"work_mem": "65536"
},
"synchronousStandbys": [
"c6cb2f35"
],
"olderWalFile": "0000001F0000005F000000D1"
}
},
"c6cb2f35": {
"uid": "c6cb2f35",
"generation": 2,
"changeTime": "2023-07-26T12:41:50.048725131Z",
"spec": {
"keeperUID": "node_4",
"requestTimeout": "4s",
"maxStandbys": 20,
"usePgrewind": true,
"additionalWalSenders": 5,
"additionalReplicationSlots": null,
"initMode": "none",
"pgParameters": {
"datestyle": "iso, mdy",
"default_text_search_config": "pg_catalog.english",
"dynamic_shared_memory_type": "posix",
"lc_messages": "en_US.UTF-8",
"lc_monetary": "en_US.UTF-8",
"lc_numeric": "en_US.UTF-8",
"lc_time": "en_US.UTF-8",
"log_min_duration_statement": "10",
"log_timezone": "UTC",
"logging_collector": "on",
"max_connections": "100",
"max_wal_size": "1GB",
"min_wal_size": "80MB",
"random_page_cost": "1.1",
"shared_buffers": "128MB",
"timezone": "UTC",
"wal_level": "replica",
"work_mem": "65536"
},
"pgHBA": null,
"role": "standby",
"followConfig": {
"type": "internal",
"dbuid": "3397fb23"
},
"followers": [],
"synchronousStandbys": null,
"externalSynchronousStandbys": null
},
"status": {
"healthy": true,
"currentGeneration": 2,
"listenAddress": "10.0.0.4",
"port": "5432",
"systemdID": "7191260299357525396",
"timelineID": 31,
"xLogPos": 411870652048,
"timelinesHistory": [
{
"timelineID": 29,
"switchPoint": 365271653264,
"reason": "no recovery target specified"
},
{
"timelineID": 30,
"switchPoint": 409000508648,
"reason": "no recovery target specified"
}
],
"pgParameters": {
"datestyle": "iso, mdy",
"default_text_search_config": "pg_catalog.english",
"dynamic_shared_memory_type": "posix",
"lc_messages": "en_US.UTF-8",
"lc_monetary": "en_US.UTF-8",
"lc_numeric": "en_US.UTF-8",
"lc_time": "en_US.UTF-8",
"log_min_duration_statement": "10",
"log_timezone": "UTC",
"logging_collector": "on",
"max_connections": "100",
"max_wal_size": "1GB",
"min_wal_size": "80MB",
"random_page_cost": "1.1",
"shared_buffers": "128MB",
"timezone": "UTC",
"wal_level": "replica",
"work_mem": "65536"
},
"synchronousStandbys": null,
"olderWalFile": "0000001F0000005F000000C5"
}
}
},
"proxy": {
"generation": 430,
"changeTime": "2023-07-26T11:41:13.06941116Z",
"spec": {
"masterDbUid": "3397fb23",
"enabledProxies": [
"26803fc5",
"295e41dc",
"62ae83e9",
"651751d3",
"c431e17d",
"c939976b",
"e0d69a43"
]
},
"status": {}
}
}
What happened:
Today we had a production outage of a couple of minutes, presumably because our stolon cluster did not fail over correctly.
I manually fixed the problem by restarting a keeper, but another keeper is still in an unhealthy state from the cluster's perspective, while the keeper itself seems to think that it is fine.
`stolonctl status` shows keeper `node_6` as `HEALTHY = false`, `PG HEALTHY = false`, and `PG LISTENADDRESS = (unknown)`:
Click to expand full `stolonctl status` output
For reading convenience, Keepers formatted nicely:
Click to expand full `stolonctl spec` output
{ "sleepInterval": "2s", "requestTimeout": "4s", "failInterval": "8s", "synchronousReplication": true, "minSynchronousStandbys": 1, "maxSynchronousStandbys": 2, "usePgrewind": true, "initMode": "new", "pgParameters": { "datestyle": "iso, mdy", "default_text_search_config": "pg_catalog.english", "dynamic_shared_memory_type": "posix", "lc_messages": "en_US.UTF-8", "lc_monetary": "en_US.UTF-8", "lc_numeric": "en_US.UTF-8", "lc_time": "en_US.UTF-8", "log_min_duration_statement": "10", "log_timezone": "UTC", "logging_collector": "on", "max_connections": "100", "max_wal_size": "1GB", "min_wal_size": "80MB", "random_page_cost": "1.1", "shared_buffers": "128MB", "timezone": "UTC", "wal_level": "replica", "work_mem": "65536" } }
Click to expand full `stolonctl clusterdata read | jq .` output
{ "formatVersion": 1, "changeTime": "2023-07-26T12:41:50.048697008Z", "cluster": { "uid": "735e8856", "generation": 1, "changeTime": "2023-07-26T11:41:07.029419632Z", "spec": { "sleepInterval": "2s", "requestTimeout": "4s", "failInterval": "8s", "synchronousReplication": true, "minSynchronousStandbys": 1, "maxSynchronousStandbys": 2, "additionalWalSenders": null, "additionalMasterReplicationSlots": null, "usePgrewind": true, "initMode": "new", "pgParameters": { "datestyle": "iso, mdy", "default_text_search_config": "pg_catalog.english", "dynamic_shared_memory_type": "posix", "lc_messages": "en_US.UTF-8", "lc_monetary": "en_US.UTF-8", "lc_numeric": "en_US.UTF-8", "lc_time": "en_US.UTF-8", "log_min_duration_statement": "10", "log_timezone": "UTC", "logging_collector": "on", "max_connections": "100", "max_wal_size": "1GB", "min_wal_size": "80MB", "random_page_cost": "1.1", "shared_buffers": "128MB", "timezone": "UTC", "wal_level": "replica", "work_mem": "65536" }, "pgHBA": null, "automaticPgRestart": null }, "status": { "phase": "normal", "master": "3397fb23" } }, "keepers": { "node_4": { "uid": "node_4", "generation": 1, "changeTime": "2023-07-26T12:41:50.048761009Z", "spec": {}, "status": { "healthy": true, "lastHealthyTime": "2023-07-26T12:41:50.046055293Z", "bootUUID": "e687f779-361c-4213-a5f4-157f352ed63a", "postgresBinaryVersion": { "Maj": 14, "Min": 1 }, "canBeMaster": true, "canBeSynchronousReplica": true } }, "node_5": { "uid": "node_5", "generation": 1, "changeTime": "2023-07-26T12:41:50.048755779Z", "spec": {}, "status": { "healthy": true, "lastHealthyTime": "2023-07-26T12:41:50.046055443Z", "bootUUID": "9b604f18-c48a-42f4-a774-cd49795e0e89", "postgresBinaryVersion": { "Maj": 14, "Min": 1 }, "canBeMaster": true, "canBeSynchronousReplica": true } }, "node_6": { "uid": "node_6", "generation": 1, "changeTime": "2023-07-24T14:10:46.921750869Z", "spec": {}, "status": { "lastHealthyTime": "2023-07-24T14:10:44.902643198Z", "bootUUID": 
"2c4a6ad5-1ed8-4a55-aa25-0c30cb2a4acc", "postgresBinaryVersion": { "Maj": 14, "Min": 1 }, "canBeMaster": true, "canBeSynchronousReplica": true } } }, "dbs": { "32e06e90": { "uid": "32e06e90", "generation": 2, "changeTime": "2023-07-26T11:41:11.057249694Z", "spec": { "keeperUID": "node_6", "requestTimeout": "4s", "maxStandbys": 20, "usePgrewind": true, "additionalWalSenders": 5, "additionalReplicationSlots": null, "initMode": "resync", "pgParameters": { "datestyle": "iso, mdy", "default_text_search_config": "pg_catalog.english", "dynamic_shared_memory_type": "posix", "lc_messages": "en_US.UTF-8", "lc_monetary": "en_US.UTF-8", "lc_numeric": "en_US.UTF-8", "lc_time": "en_US.UTF-8", "log_min_duration_statement": "10", "log_timezone": "UTC", "logging_collector": "on", "max_connections": "100", "max_wal_size": "1GB", "min_wal_size": "80MB", "random_page_cost": "1.1", "shared_buffers": "128MB", "timezone": "UTC", "wal_level": "replica", "work_mem": "65536" }, "pgHBA": null, "role": "standby", "followConfig": { "type": "internal", "dbuid": "3397fb23" }, "followers": [], "synchronousStandbys": null, "externalSynchronousStandbys": null }, "status": { "synchronousStandbys": null } }, "3397fb23": { "uid": "3397fb23", "generation": 5, "changeTime": "2023-07-26T12:41:50.048753905Z", "spec": { "keeperUID": "node_5", "requestTimeout": "4s", "maxStandbys": 20, "synchronousReplication": true, "usePgrewind": true, "additionalWalSenders": 5, "additionalReplicationSlots": null, "initMode": "none", "pgParameters": { "datestyle": "iso, mdy", "default_text_search_config": "pg_catalog.english", "dynamic_shared_memory_type": "posix", "lc_messages": "en_US.UTF-8", "lc_monetary": "en_US.UTF-8", "lc_numeric": "en_US.UTF-8", "lc_time": "en_US.UTF-8", "log_min_duration_statement": "10", "log_timezone": "UTC", "logging_collector": "on", "max_connections": "100", "max_wal_size": "1GB", "min_wal_size": "80MB", "random_page_cost": "1.1", "shared_buffers": "128MB", "timezone": "UTC", "wal_level": 
"replica", "work_mem": "65536" }, "pgHBA": null, "role": "master", "followers": [ "32e06e90", "c6cb2f35" ], "synchronousStandbys": [ "c6cb2f35" ], "externalSynchronousStandbys": [] }, "status": { "healthy": true, "currentGeneration": 5, "listenAddress": "10.0.0.5", "port": "5432", "systemdID": "7191260299357525396", "timelineID": 31, "xLogPos": 411870652720, "timelinesHistory": [ { "timelineID": 29, "switchPoint": 365271653264, "reason": "no recovery target specified" }, { "timelineID": 30, "switchPoint": 409000508648, "reason": "no recovery target specified" } ], "pgParameters": { "datestyle": "iso, mdy", "default_text_search_config": "pg_catalog.english", "dynamic_shared_memory_type": "posix", "lc_messages": "en_US.UTF-8", "lc_monetary": "en_US.UTF-8", "lc_numeric": "en_US.UTF-8", "lc_time": "en_US.UTF-8", "log_min_duration_statement": "10", "log_timezone": "UTC", "logging_collector": "on", "max_connections": "100", "max_wal_size": "1GB", "min_wal_size": "80MB", "random_page_cost": "1.1", "shared_buffers": "128MB", "timezone": "UTC", "wal_level": "replica", "work_mem": "65536" }, "synchronousStandbys": [ "c6cb2f35" ], "olderWalFile": "0000001F0000005F000000D1" } }, "c6cb2f35": { "uid": "c6cb2f35", "generation": 2, "changeTime": "2023-07-26T12:41:50.048725131Z", "spec": { "keeperUID": "node_4", "requestTimeout": "4s", "maxStandbys": 20, "usePgrewind": true, "additionalWalSenders": 5, "additionalReplicationSlots": null, "initMode": "none", "pgParameters": { "datestyle": "iso, mdy", "default_text_search_config": "pg_catalog.english", "dynamic_shared_memory_type": "posix", "lc_messages": "en_US.UTF-8", "lc_monetary": "en_US.UTF-8", "lc_numeric": "en_US.UTF-8", "lc_time": "en_US.UTF-8", "log_min_duration_statement": "10", "log_timezone": "UTC", "logging_collector": "on", "max_connections": "100", "max_wal_size": "1GB", "min_wal_size": "80MB", "random_page_cost": "1.1", "shared_buffers": "128MB", "timezone": "UTC", "wal_level": "replica", "work_mem": "65536" }, 
"pgHBA": null, "role": "standby", "followConfig": { "type": "internal", "dbuid": "3397fb23" }, "followers": [], "synchronousStandbys": null, "externalSynchronousStandbys": null }, "status": { "healthy": true, "currentGeneration": 2, "listenAddress": "10.0.0.4", "port": "5432", "systemdID": "7191260299357525396", "timelineID": 31, "xLogPos": 411870652048, "timelinesHistory": [ { "timelineID": 29, "switchPoint": 365271653264, "reason": "no recovery target specified" }, { "timelineID": 30, "switchPoint": 409000508648, "reason": "no recovery target specified" } ], "pgParameters": { "datestyle": "iso, mdy", "default_text_search_config": "pg_catalog.english", "dynamic_shared_memory_type": "posix", "lc_messages": "en_US.UTF-8", "lc_monetary": "en_US.UTF-8", "lc_numeric": "en_US.UTF-8", "lc_time": "en_US.UTF-8", "log_min_duration_statement": "10", "log_timezone": "UTC", "logging_collector": "on", "max_connections": "100", "max_wal_size": "1GB", "min_wal_size": "80MB", "random_page_cost": "1.1", "shared_buffers": "128MB", "timezone": "UTC", "wal_level": "replica", "work_mem": "65536" }, "synchronousStandbys": null, "olderWalFile": "0000001F0000005F000000C5" } } }, "proxy": { "generation": 430, "changeTime": "2023-07-26T11:41:13.06941116Z", "spec": { "masterDbUid": "3397fb23", "enabledProxies": [ "26803fc5", "295e41dc", "62ae83e9", "651751d3", "c431e17d", "c939976b", "e0d69a43" ] }, "status": {} } }Relevant here is
The `stolon-keeper` seems to be running fine according to `systemctl status stolon-keeper.service`:
Details
Its logs show some earlier errors while `pg_rewind` was copying data over, but no indication that anything failed permanently:
Relevant sections:
What you expected to happen:
`stolonctl status` finds that the keeper is up and working, or
How to reproduce it (as minimally and precisely as possible):
Unclear.
Stolon had been running uninterrupted for 2 months until this happened.
Environment:
`master` commit 4bb4107