summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorChristoph Berg <[email protected]>2016-02-18 12:21:07 +0100
committerChristoph Berg <[email protected]>2016-02-18 12:21:07 +0100
commit04d6e40872e300b7b716988ad1a5e99b4efbf6b3 (patch)
tree09bb3652db0c1fc89a78ba989edb9d21d4df45ad
parent10952674ad721cbca5a9c21e042376cc6be37c51 (diff)
parent3ef43524d341ac27bd3a67ecb7fea3f087997d0d (diff)
Merge tag 'debian/3.0.3-2' into wheezy-backports-sloppy
repmgr Debian release 3.0.3-2
-rw-r--r--FAILOVER.rst7
-rw-r--r--FAQ.md23
-rw-r--r--HISTORY23
-rw-r--r--README.md62
-rw-r--r--TODO33
-rw-r--r--check_dir.c8
-rw-r--r--config.c423
-rw-r--r--config.h23
-rw-r--r--dbutils.c304
-rw-r--r--dbutils.h43
-rw-r--r--debian/changelog14
-rw-r--r--debian/control18
-rw-r--r--debian/control.in2
-rw-r--r--debian/patches/makefile-libpq-internal.patch24
-rw-r--r--debian/patches/makefile-no-libs.patch2
-rw-r--r--debian/patches/regress.patch4
-rw-r--r--debian/patches/series1
-rw-r--r--errcode.h1
-rw-r--r--log.c128
-rw-r--r--log.h12
-rw-r--r--repmgr.c578
-rw-r--r--repmgr.conf.sample35
-rw-r--r--repmgr.h25
-rw-r--r--repmgrd.c308
-rw-r--r--version.h2
25 files changed, 1469 insertions, 634 deletions
diff --git a/FAILOVER.rst b/FAILOVER.rst
index 4c39a08..7212863 100644
--- a/FAILOVER.rst
+++ b/FAILOVER.rst
@@ -93,7 +93,6 @@ Create the user and database to manage replication::
su - postgres
createuser -s repmgr
createdb -O repmgr repmgr
- psql -f /usr/share/postgresql/9.0/contrib/repmgr_funcs.sql repmgr
Restart the PostgreSQL server::
@@ -121,7 +120,7 @@ Log in to node2.
Clone node1 (the current Master)::
su - postgres
- repmgr -d repmgr -U repmgr -h node1 standby clone
+ repmgr -d repmgr -U repmgr -h node1 standby clone
Start the PostgreSQL server::
@@ -172,11 +171,13 @@ Register Master and Standby
Log in to node1.
-Register the node as Master::
+Register the node as master::
su - postgres
repmgr -f /etc/repmgr/repmgr.conf master register
+This will also create the repmgr schema and functions.
+
Log in to node2. Register it as a standby::
su - postgres
diff --git a/FAQ.md b/FAQ.md
index 6c8d6a9..eb9d4c2 100644
--- a/FAQ.md
+++ b/FAQ.md
@@ -34,6 +34,11 @@ General
replication slots, setting a higher figure will make adding new nodes
easier.
+- Does `repmgr` support hash indexes?
+
+ No. Hash indexes and replication do not mix well and their use is
+ explicitly discouraged; see:
+ http://www.postgresql.org/docs/current/interactive/sql-createindex.html#AEN74175
`repmgr`
--------
@@ -96,8 +101,9 @@ General
is intended to support running the witness server as a separate
instance on a normal node server, rather than on its own dedicated server.
- To specify a port for the witness server, supply the port number to
- repmgr with the `-l/--local-port` command line option.
+ To specify a different port for the witness server, supply the port number
+ in the `conninfo` string in `repmgr.conf`
+ (repmgr 3.0.1 and earlier: use the `-l/--local-port` option)
- Do I need to include `shared_preload_libraries = 'repmgr_funcs'`
in `postgresql.conf` if I'm not using `repmgrd`?
@@ -106,6 +112,14 @@ General
If you later decide to run `repmgrd`, you just need to add
`shared_preload_libraries = 'repmgr_funcs'` and restart PostgreSQL.
+- I've provided replication permission for the `repmgr` user in `pg_hba.conf`
+ but `repmgr`/`repmgrd` complains it can't connect to the server... Why?
+
+ `repmgr`/`repmgrd` need to be able to connect to the repmgr database
+ with a normal connection to query metadata. The `replication` connection
+ permission is for PostgreSQL's streaming replication and doesn't
+ necessarily need to be the `repmgr` user.
+
`repmgrd`
---------
@@ -134,3 +148,8 @@ General
Note that after registering a delayed standby, `repmgrd` will only start
once the metadata added in the master node has been replicated.
+
+- How can I get `repmgrd` to rotate its logfile?
+
+ Configure your system's `logrotate` service to do this; see example
+ in README.md
diff --git a/HISTORY b/HISTORY
index a61e8cd..2311637 100644
--- a/HISTORY
+++ b/HISTORY
@@ -1,4 +1,21 @@
-3.0.2 2015-09-
+3.0.3 2016-01-04
+ Create replication slot if required before base backup is run (Abhijit)
+ standby clone: when using rsync, clean up "pg_replslot" directory (Ian)
+ Improve --help output (Ian)
+ Improve config file parsing (Ian)
+ Various logging output improvements, including explicit HINTS (Ian)
+ Add --log-level to explicitly set log level on command line (Ian)
+ Repurpose --verbose to display extra log output (Ian)
+ Add --terse to hide hints and other non-critical output (Ian)
+ Reference internal functions with explicit catalog path (Ian)
+ When following a new primary, have repmgr (not repmgrd) create the new slot (Ian)
+ Add /etc/repmgr.conf as a default configuration file location (Ian)
+ Prevent repmgrd's -v/--verbose option expecting a parameter (Ian)
+ Prevent invalid replication_lag values being written to the monitoring table (Ian)
+ Improve repmgrd behaviour when monitored standby node is temporarily
+ unavailable (Martín)
+
+3.0.2 2015-10-02
Improve handling of --help/--version options; and improve help output (Ian)
Improve handling of situation where logfile can't be opened (Ian)
Always pass -D/--pgdata option to pg_basebackup (Ian)
@@ -12,7 +29,9 @@
Update tablespace remapping in --rsync-only mode for 9.5 and later (Ian)
Deprecate `-l/--local-port` option - the port can be extracted
from the conninfo string in repmgr.conf (Ian)
- Add STANDBY UNREGISTE (Vik Fearing)
+ Add STANDBY UNREGISTER (Vik Fearing)
+ Don't fail with error when registering master if schema already defined (Ian)
+ Fixes to whitespace handling when parsing config file (Ian)
3.0.1 2015-04-16
Prevent repmgrd from looping infinitely if node was not registered (Ian)
diff --git a/README.md b/README.md
index b39e4cc..6ce9dcb 100644
--- a/README.md
+++ b/README.md
@@ -12,10 +12,13 @@ This version can use `pg_basebackup` to clone standby servers, supports
replication slots and cascading replication, doesn't require a restart
after promotion, and has many usability improvements.
-Please continue to use `repmgr 2` with earlier PostgreSQL 9.x versions.
+Please continue to use `repmgr 2` with PostgreSQL 9.2 and earlier.
For a list of changes since `repmgr 2` and instructions on upgrading to
`repmgr 3`, see the "Upgrading from repmgr 2" section below.
+For a list of frequently asked questions about `repmgr`, please refer
+to the file `FAQ.md`.
+
Overview
--------
@@ -111,7 +114,7 @@ like the following in `postgresql.conf`:
# How much WAL to retain on the primary to allow a temporarily
# disconnected standby to catch up again. The larger this is, the
# longer the standby can be disconnected. This is needed only in
- # 9.3; in 9.4, replication slots can be used instead (see below).
+ # 9.3; from 9.4, replication slots can be used instead (see below).
wal_keep_segments = 5000
@@ -141,10 +144,14 @@ running the following commands:
createuser -s repmgr
createdb repmgr -O repmgr
-We recommend using the name `repmgr` for both, but you can use whatever
-name you like (and you need to set the names you chose in the `conninfo`
-string in `repmgr.conf`; see below). `repmgr` will create the schema and
-objects it needs when it connects to the server.
+We recommend using the name `repmgr` for both user and database, but you
+can use whatever name you like (and you need to set the names you chose
+in the `conninfo` string in `repmgr.conf`; see below). We also recommend
+that you set the `repmgr` user's search path to include the `repmgr` schema
+for convenience when querying the metadata tables and views.
+
+The `repmgr` application will create its metadata schema in the `repmgr`
+database when the master server is registered.
### repmgr configuration
@@ -260,6 +267,20 @@ Example log output (at default log level):
[2015-03-11 13:15:40] [INFO] reloading configuration file and updating repmgr tables
[2015-03-11 13:15:40] [INFO] starting continuous standby node monitoring
+Note that currently `repmgrd` does not provide logfile rotation. To ensure
+the current logfile does not grow indefinitely, configure your system's `logrotate`
+to do this. Sample configuration to rotate logfiles weekly with retention
+for up to 52 weeks and rotation forced if a file grows beyond 100Mb:
+
+ /var/log/postgresql/repmgr-9.4.log {
+ missingok
+ compress
+ rotate 52
+ maxsize 100M
+ weekly
+ create 0600 postgres postgres
+ }
+
Witness server
--------------
@@ -355,6 +376,7 @@ Following event types currently exist:
standby_promote
witness_create
repmgrd_start
+ repmgrd_monitor
repmgrd_failover_promote
repmgrd_failover_follow
@@ -585,20 +607,20 @@ and one view:
`repmgr` or `repmgrd` will return one of the following error codes on program
exit:
-* SUCCESS (0) Program ran successfully.
-* ERR_BAD_CONFIG (1) Configuration file could not be parsed or was invalid
-* ERR_BAD_RSYNC (2) An rsync call made by the program returned an error
-* ERR_NO_RESTART (4) An attempt to restart a PostgreSQL instance failed
-* ERR_DB_CON (6) Error when trying to connect to a database
-* ERR_DB_QUERY (7) Error while executing a database query
-* ERR_PROMOTED (8) Exiting program because the node has been promoted to master
-* ERR_BAD_PASSWORD (9) Password used to connect to a database was rejected
-* ERR_STR_OVERFLOW (10) String overflow error
-* ERR_FAILOVER_FAIL (11) Error encountered during failover (repmgrd only)
-* ERR_BAD_SSH (12) Error when connecting to remote host via SSH
-* ERR_SYS_FAILURE (13) Error when forking (repmgrd only)
-* ERR_BAD_BASEBACKUP (14) Error when executing pg_basebackup
-
+* SUCCESS (0) Program ran successfully.
+* ERR_BAD_CONFIG (1) Configuration file could not be parsed or was invalid
+* ERR_BAD_RSYNC (2) An rsync call made by the program returned an error
+* ERR_NO_RESTART (4) An attempt to restart a PostgreSQL instance failed
+* ERR_DB_CON (6) Error when trying to connect to a database
+* ERR_DB_QUERY (7) Error while executing a database query
+* ERR_PROMOTED (8) Exiting program because the node has been promoted to master
+* ERR_BAD_PASSWORD (9) Password used to connect to a database was rejected
+* ERR_STR_OVERFLOW (10) String overflow error
+* ERR_FAILOVER_FAIL (11) Error encountered during failover (repmgrd only)
+* ERR_BAD_SSH (12) Error when connecting to remote host via SSH
+* ERR_SYS_FAILURE (13) Error when forking (repmgrd only)
+* ERR_BAD_BASEBACKUP (14) Error when executing pg_basebackup
+* ERR_MONITORING_FAIL (16) Unrecoverable error encountered during monitoring (repmgrd only)
Support and Assistance
----------------------
diff --git a/TODO b/TODO
index bb96665..3e377a3 100644
--- a/TODO
+++ b/TODO
@@ -7,6 +7,7 @@ Known issues in repmgr
* PGPASSFILE may not be passed to pg_basebackup
+
Planned feature improvements
============================
@@ -42,3 +43,35 @@ Planned feature improvements
* Have pg_basebackup use replication slots, if and when support for
this is added; see:
http://www.postgresql.org/message-id/[email protected]
+
+* use "primary/standby" terminology in place of "master/slave" for consistency
+ with main PostgreSQL usage
+
+* repmgr standby clone: possibility to use barman instead of performing a new base backup
+
+* possibility to transform a failed master into a new standby with pg_rewind
+
+* "repmgr standby switchover" to promote a standby in a controlled manner
+ and convert the existing primary into a standby
+
+* make repmgrd more robust
+
+* repmgr: when cloning a standby using pg_basebackup and replication slots are
+ requested, activate the replication slot using pg_receivexlog to negate the
+ need to set `wal_keep_segments` just for the initial clone (9.4 and 9.5).
+
+Usability improvements
+======================
+
+* repmgr: add interrupt handler, so that if the program is interrupted
+ while running a backup, an attempt can be made to execute pg_stop_backup()
+ on the primary, to prevent an orphaned backup state existing.
+
+* repmgr: when unregistering a node, delete any entries in the repl_monitoring
+ table.
+
+* repmgr: for "standby unregister", accept connection parameters for the
+ primary and perform metadata updates (and slot removal) directly on
+ the primary, to allow a shutdown standby to be unregistered
+ (currently the standby must still be running, which means the replication
+ slot can't be dropped).
diff --git a/check_dir.c b/check_dir.c
index 6289079..adf446b 100644
--- a/check_dir.c
+++ b/check_dir.c
@@ -320,10 +320,10 @@ _create_pg_dir(char *dir, bool force, bool for_witness)
}
else if (pg_dir && !force)
{
- log_warning(_("\nThis looks like a PostgreSQL directory.\n"
- "If you are sure you want to clone here, "
- "please check there is no PostgreSQL server "
- "running and use the --force option\n"));
+ log_hint(_("This looks like a PostgreSQL directory.\n"
+ "If you are sure you want to clone here, "
+ "please check there is no PostgreSQL server "
+ "running and use the -F/--force option\n"));
return false;
}
diff --git a/config.c b/config.c
index 1b1f132..484c562 100644
--- a/config.c
+++ b/config.c
@@ -26,9 +26,25 @@
static void parse_event_notifications_list(t_configuration_options *options, const char *arg);
static void tablespace_list_append(t_configuration_options *options, const char *arg);
+static void exit_with_errors(ErrorList *config_errors);
+/* Program name as set by set_progname(); NULL until that has been called */
+static const char *_progname = NULL;
static char config_file_path[MAXPGPATH];
static bool config_file_provided = false;
+static bool config_file_found = false;
+
+
+/*
+ * set_progname()
+ *
+ * Store the program name derived from argv0 in the file-level variable
+ * "_progname", from where it can be retrieved with progname().
+ *
+ * NOTE(review): get_progname() is not defined in this file - presumably
+ * the PostgreSQL port library helper; confirm what it returns and who owns
+ * the returned string.
+ */
+void
+set_progname(const char *argv0)
+{
+ _progname = get_progname(argv0);
+}
+
+/*
+ * progname()
+ *
+ * Return the program name previously stored by set_progname(), i.e. the
+ * current value of the file-level variable "_progname".
+ */
+const char *
+progname(void)
+{
+ return _progname;
+}
/*
* load_config()
@@ -40,61 +56,123 @@ static bool config_file_provided = false;
*
* Any configuration options changed in this function must also be changed in
* reload_config()
+ *
+ * NOTE: this function is called before the logger is set up, so we need
+ * to handle the verbose option ourselves; also the default log level is NOTICE,
+ * so we can't use DEBUG.
*/
bool
-load_config(const char *config_file, t_configuration_options *options, char *argv0)
+load_config(const char *config_file, bool verbose, t_configuration_options *options, char *argv0)
{
- struct stat config;
- /* Sanity checks */
+ struct stat stat_config;
/*
* If a configuration file was provided, check it exists, otherwise
- * emit an error and terminate
+ * emit an error and terminate. We assume that if a user explicitly
+ * provides a configuration file, they'll want to make sure it's
+ * used and not fall back to any of the defaults.
*/
if (config_file[0])
{
strncpy(config_file_path, config_file, MAXPGPATH);
canonicalize_path(config_file_path);
- if (stat(config_file_path, &config) != 0)
+ if (stat(config_file_path, &stat_config) != 0)
{
- log_err(_("provided configuration file '%s' not found: %s\n"),
+ log_err(_("provided configuration file \"%s\" not found: %s\n"),
config_file,
strerror(errno)
);
exit(ERR_BAD_CONFIG);
}
+ if (verbose == true)
+ {
+ log_notice(_("using configuration file \"%s\"\n"), config_file);
+ }
+
config_file_provided = true;
+ config_file_found = true;
}
/*
* If no configuration file was provided, attempt to find a default file
+ * in this order:
+ * - current directory
+ * - /etc/repmgr.conf
+ * - default sysconfdir
+ *
+ * here we just check for the existence of the file; parse_config()
+ * will handle read errors etc.
*/
if (config_file_provided == false)
{
char my_exec_path[MAXPGPATH];
- char etc_path[MAXPGPATH];
+ char sysconf_etc_path[MAXPGPATH];
+
+ /* 1. "./repmgr.conf" */
+ if (verbose == true)
+ {
+ log_notice(_("looking for configuration file in current directory\n"));
+ }
+
+ snprintf(config_file_path, MAXPGPATH, "./%s", CONFIG_FILE_NAME);
+ canonicalize_path(config_file_path);
+
+ if (stat(config_file_path, &stat_config) == 0)
+ {
+ config_file_found = true;
+ goto end_search;
+ }
+
+ /* 2. "/etc/repmgr.conf" */
+ if (verbose == true)
+ {
+ log_notice(_("looking for configuration file in /etc\n"));
+ }
+
+ snprintf(config_file_path, MAXPGPATH, "/etc/%s", CONFIG_FILE_NAME);
+ if (stat(config_file_path, &stat_config) == 0)
+ {
+ config_file_found = true;
+ goto end_search;
+ }
- /* First check if one is in the default sysconfdir */
+ /* 3. default sysconfdir */
if (find_my_exec(argv0, my_exec_path) < 0)
{
fprintf(stderr, _("%s: could not find own program executable\n"), argv0);
exit(EXIT_FAILURE);
}
- get_etc_path(my_exec_path, etc_path);
+ get_etc_path(my_exec_path, sysconf_etc_path);
- snprintf(config_file_path, MAXPGPATH, "%s/repmgr.conf", etc_path);
+ if (verbose == true)
+ {
+ log_notice(_("looking for configuration file in %s"), sysconf_etc_path);
+ }
- log_debug(_("Looking for configuration file in %s\n"), etc_path);
+ snprintf(config_file_path, MAXPGPATH, "%s/%s", sysconf_etc_path, CONFIG_FILE_NAME);
+ if (stat(config_file_path, &stat_config) == 0)
+ {
+ config_file_found = true;
+ goto end_search;
+ }
- if (stat(config_file_path, &config) != 0)
+ end_search:
+ if (config_file_found == true)
+ {
+ if (verbose == true)
+ {
+ log_notice(_("configuration file found at: %s\n"), config_file_path);
+ }
+ }
+ else
{
- /* Not found - default to ./repmgr.conf */
- strncpy(config_file_path, DEFAULT_CONFIG_FILE, MAXPGPATH);
- canonicalize_path(config_file_path);
- log_debug(_("Looking for configuration file in %s\n"), config_file_path);
+ if (verbose == true)
+ {
+ log_notice(_("no configuration file provided or found\n"));
+ }
}
}
@@ -102,12 +180,19 @@ load_config(const char *config_file, t_configuration_options *options, char *arg
}
+/*
+ * Parse configuration file; if any errors are encountered,
+ * list them and exit.
+ *
+ * Ensure any default values set here are synced with repmgr.conf.sample
+ * and any other documentation.
+ */
bool
parse_config(t_configuration_options *options)
{
FILE *fp;
char *s,
- buff[MAXLINELENGTH];
+ buf[MAXLINELENGTH];
char name[MAXLEN];
char value[MAXLEN];
@@ -115,36 +200,17 @@ parse_config(t_configuration_options *options)
PQconninfoOption *conninfo_options;
char *conninfo_errmsg = NULL;
- fp = fopen(config_file_path, "r");
+ /* Collate configuration file errors here for friendlier reporting */
+ static ErrorList config_errors = { NULL, NULL };
- /*
- * Since some commands don't require a config file at all, not having one
- * isn't necessarily a problem.
- *
- * If the user explictly provided a configuration file and we can't
- * read it we'll raise an error.
- *
- * If no configuration file was provided, we'll try and read the default\
- * file if it exists and is readable, but won't worry if it's not.
+ /* Initialize configuration options with sensible defaults
+ * note: the default log level is set in log.c and does not need
+ * to be initialised here
*/
- if (fp == NULL)
- {
- if (config_file_provided)
- {
- log_err(_("unable to open provided configuration file '%s'; terminating\n"), config_file_path);
- exit(ERR_BAD_CONFIG);
- }
-
- log_notice(_("no configuration file provided and default file '%s' not found - "
- "continuing with default values\n"),
- DEFAULT_CONFIG_FILE);
- return false;
- }
-
- /* Initialize configuration options with sensible defaults */
memset(options->cluster_name, 0, sizeof(options->cluster_name));
options->node = -1;
options->upstream_node = NO_UPSTREAM_NODE;
+ options->use_replication_slots = 0;
memset(options->conninfo, 0, sizeof(options->conninfo));
options->failover = MANUAL_FAILOVER;
options->priority = DEFAULT_PRIORITY;
@@ -162,7 +228,7 @@ parse_config(t_configuration_options *options)
/* default to 6 reconnection attempts at intervals of 10 seconds */
options->reconnect_attempts = 6;
- options->reconnect_intvl = 10;
+ options->reconnect_interval = 10;
options->monitor_interval_secs = 2;
options->retry_promote_interval_secs = 300;
@@ -172,15 +238,45 @@ parse_config(t_configuration_options *options)
options->tablespace_mapping.head = NULL;
options->tablespace_mapping.tail = NULL;
+ /*
+ * If no configuration file available (user didn't specify and none found
+ * in the default locations), return with default values
+ */
+ if (config_file_found == false)
+ {
+ log_notice(_("no configuration file provided and no default file found - "
+ "continuing with default values\n"));
+ return true;
+ }
+
+ fp = fopen(config_file_path, "r");
+
+ /*
+ * A configuration file has been found, either provided by the user
+ * or found in one of the default locations. If we can't open it,
+ * fail with an error.
+ */
+ if (fp == NULL)
+ {
+ if (config_file_provided)
+ {
+ log_err(_("unable to open provided configuration file \"%s\"; terminating\n"), config_file_path);
+ }
+ else
+ {
+ log_err(_("unable to open default configuration file \"%s\"; terminating\n"), config_file_path);
+ }
+ exit(ERR_BAD_CONFIG);
+ }
- /* Read next line */
- while ((s = fgets(buff, sizeof buff, fp)) != NULL)
+ /* Read file */
+ while ((s = fgets(buf, sizeof buf, fp)) != NULL)
{
bool known_parameter = true;
/* Parse name/value pair from line */
- parse_line(buff, name, value);
+ parse_line(buf, name, value);
/* Skip blank lines */
if (!strlen(name))
@@ -194,9 +290,9 @@ parse_config(t_configuration_options *options)
if (strcmp(name, "cluster") == 0)
strncpy(options->cluster_name, value, MAXLEN);
else if (strcmp(name, "node") == 0)
- options->node = atoi(value);
+ options->node = repmgr_atoi(value, "node", &config_errors);
else if (strcmp(name, "upstream_node") == 0)
- options->upstream_node = atoi(value);
+ options->upstream_node = repmgr_atoi(value, "upstream_node", &config_errors);
else if (strcmp(name, "conninfo") == 0)
strncpy(options->conninfo, value, MAXLEN);
else if (strcmp(name, "rsync_options") == 0)
@@ -223,12 +319,11 @@ parse_config(t_configuration_options *options)
}
else
{
- log_err(_("value for 'failover' must be 'automatic' or 'manual'\n"));
- exit(ERR_BAD_CONFIG);
+ error_list_append(&config_errors,_("value for 'failover' must be 'automatic' or 'manual'\n"));
}
}
else if (strcmp(name, "priority") == 0)
- options->priority = atoi(value);
+ options->priority = repmgr_atoi(value, "priority", &config_errors);
else if (strcmp(name, "node_name") == 0)
strncpy(options->node_name, value, MAXLEN);
else if (strcmp(name, "promote_command") == 0)
@@ -236,11 +331,16 @@ parse_config(t_configuration_options *options)
else if (strcmp(name, "follow_command") == 0)
strncpy(options->follow_command, value, MAXLEN);
else if (strcmp(name, "master_response_timeout") == 0)
- options->master_response_timeout = atoi(value);
+ options->master_response_timeout = repmgr_atoi(value, "master_response_timeout", &config_errors);
+ /* 'primary_response_timeout' as synonym for 'master_response_timeout' -
+ * we'll switch terminology in a future release (3.1?)
+ */
+ else if (strcmp(name, "primary_response_timeout") == 0)
+ options->master_response_timeout = repmgr_atoi(value, "primary_response_timeout", &config_errors);
else if (strcmp(name, "reconnect_attempts") == 0)
- options->reconnect_attempts = atoi(value);
+ options->reconnect_attempts = repmgr_atoi(value, "reconnect_attempts", &config_errors);
else if (strcmp(name, "reconnect_interval") == 0)
- options->reconnect_intvl = atoi(value);
+ options->reconnect_interval = repmgr_atoi(value, "reconnect_interval", &config_errors);
else if (strcmp(name, "pg_bindir") == 0)
strncpy(options->pg_bindir, value, MAXLEN);
else if (strcmp(name, "pg_ctl_options") == 0)
@@ -250,11 +350,12 @@ parse_config(t_configuration_options *options)
else if (strcmp(name, "logfile") == 0)
strncpy(options->logfile, value, MAXLEN);
else if (strcmp(name, "monitor_interval_secs") == 0)
- options->monitor_interval_secs = atoi(value);
+ options->monitor_interval_secs = repmgr_atoi(value, "monitor_interval_secs", &config_errors);
else if (strcmp(name, "retry_promote_interval_secs") == 0)
- options->retry_promote_interval_secs = atoi(value);
+ options->retry_promote_interval_secs = repmgr_atoi(value, "retry_promote_interval_secs", &config_errors);
else if (strcmp(name, "use_replication_slots") == 0)
- options->use_replication_slots = atoi(value);
+ /* XXX we should have a dedicated boolean argument format */
+ options->use_replication_slots = repmgr_atoi(value, "use_replication_slots", &config_errors);
else if (strcmp(name, "event_notification_command") == 0)
strncpy(options->event_notification_command, value, MAXLEN);
else if (strcmp(name, "event_notifications") == 0)
@@ -274,8 +375,13 @@ parse_config(t_configuration_options *options)
* as currently e.g. an empty `node` value will be converted to '0'.
*/
if (known_parameter == true && !strlen(value)) {
- log_err(_("no value provided for parameter '%s'\n"), name);
- exit(ERR_BAD_CONFIG);
+ char error_message_buf[MAXLEN] = "";
+ snprintf(error_message_buf,
+ MAXLEN,
+ _("no value provided for parameter \"%s\""),
+ name);
+
+ error_list_append(&config_errors, error_message_buf);
}
}
@@ -286,64 +392,49 @@ parse_config(t_configuration_options *options)
/* The following checks are for the presence of the parameter */
if (*options->cluster_name == '\0')
{
- log_err(_("required parameter 'cluster' was not found\n"));
- exit(ERR_BAD_CONFIG);
+ error_list_append(&config_errors, _("\"cluster\": parameter was not found\n"));
}
if (options->node == -1)
{
- log_err(_("required parameter 'node' was not found\n"));
- exit(ERR_BAD_CONFIG);
- }
-
- if (options->node == 0)
- {
- log_err(_("'node' must be an integer greater than zero\n"));
- exit(ERR_BAD_CONFIG);
+ error_list_append(&config_errors, _("\"node\": parameter was not found\n"));
}
if (*options->node_name == '\0')
{
- log_err(_("required parameter 'node_name' was not found\n"));
- exit(ERR_BAD_CONFIG);
+ error_list_append(&config_errors, _("\"node_name\": parameter was not found\n"));
}
if (*options->conninfo == '\0')
{
- log_err(_("required parameter 'conninfo' was not found\n"));
- exit(ERR_BAD_CONFIG);
+ error_list_append(&config_errors, _("\"conninfo\": parameter was not found\n"));
}
-
- /* Sanity check the provided conninfo string
- *
- * NOTE: this verifies the string format and checks for valid options
- * but does not sanity check values
- */
- conninfo_options = PQconninfoParse(options->conninfo, &conninfo_errmsg);
- if (conninfo_options == NULL)
+ else
{
- log_err(_("Parameter 'conninfo' is invalid: %s"), conninfo_errmsg);
- exit(ERR_BAD_CONFIG);
- }
- PQconninfoFree(conninfo_options);
- /* The following checks are for valid parameter values */
- if (options->master_response_timeout <= 0)
- {
- log_err(_("'master_response_timeout' must be greater than zero\n"));
- exit(ERR_BAD_CONFIG);
- }
+ /* Sanity check the provided conninfo string
+ *
+ * NOTE: PQconninfoParse() verifies the string format and checks for valid options
+ * but does not sanity check values
+ */
+ conninfo_options = PQconninfoParse(options->conninfo, &conninfo_errmsg);
+ if (conninfo_options == NULL)
+ {
+ char error_message_buf[MAXLEN] = "";
+ snprintf(error_message_buf,
+ MAXLEN,
+ _("\"conninfo\": %s"),
+ conninfo_errmsg);
- if (options->reconnect_attempts < 0)
- {
- log_err(_("'reconnect_attempts' must be zero or greater\n"));
- exit(ERR_BAD_CONFIG);
+ error_list_append(&config_errors, error_message_buf);
+ }
+
+ PQconninfoFree(conninfo_options);
}
- if (options->reconnect_intvl < 0)
+ if (config_errors.head != NULL)
{
- log_err(_("'reconnect_interval' must be zero or greater\n"));
- exit(ERR_BAD_CONFIG);
+ exit_with_errors(&config_errors);
}
return true;
@@ -378,7 +469,7 @@ trim(char *s)
}
void
-parse_line(char *buff, char *name, char *value)
+parse_line(char *buf, char *name, char *value)
{
int i = 0;
int j = 0;
@@ -389,10 +480,10 @@ parse_line(char *buff, char *name, char *value)
for (; i < MAXLEN; ++i)
{
- if (buff[i] == '=')
+ if (buf[i] == '=')
break;
- switch(buff[i])
+ switch(buf[i])
{
/* Ignore whitespace */
case ' ':
@@ -401,7 +492,7 @@ parse_line(char *buff, char *name, char *value)
case '\t':
continue;
default:
- name[j++] = buff[i];
+ name[j++] = buf[i];
}
}
name[j] = '\0';
@@ -411,9 +502,9 @@ parse_line(char *buff, char *name, char *value)
*/
for (; i < MAXLEN; ++i)
{
- if (buff[i+1] == ' ')
+ if (buf[i+1] == ' ')
continue;
- if (buff[i+1] == '\t')
+ if (buf[i+1] == '\t')
continue;
break;
@@ -424,12 +515,12 @@ parse_line(char *buff, char *name, char *value)
*/
j = 0;
for (++i; i < MAXLEN; ++i)
- if (buff[i] == '\'')
+ if (buf[i] == '\'')
continue;
- else if (buff[i] == '#')
+ else if (buf[i] == '#')
break;
- else if (buff[i] != '\n')
- value[j++] = buff[i];
+ else if (buf[i] != '\n')
+ value[j++] = buf[i];
else
break;
value[j] = '\0';
@@ -491,7 +582,7 @@ reload_config(t_configuration_options *orig_options)
return false;
}
- if (new_options.reconnect_intvl < 0)
+ if (new_options.reconnect_interval < 0)
{
log_warning(_("new value for 'reconnect_interval' must be zero or greater\n"));
return false;
@@ -610,10 +701,10 @@ reload_config(t_configuration_options *orig_options)
config_changed = true;
}
- /* reconnect_intvl */
- if (orig_options->reconnect_intvl != new_options.reconnect_intvl)
+ /* reconnect_interval */
+ if (orig_options->reconnect_interval != new_options.reconnect_interval)
{
- orig_options->reconnect_intvl = new_options.reconnect_intvl;
+ orig_options->reconnect_interval = new_options.reconnect_interval;
config_changed = true;
}
@@ -665,6 +756,96 @@ reload_config(t_configuration_options *orig_options)
}
+/*
+ * error_list_append()
+ *
+ * Append a copy of "error_message" to the end of "error_list".
+ *
+ * error_list:    list to append to; its head/tail pointers are updated
+ * error_message: NUL-terminated message; at most MAXLEN - 1 characters
+ *                are stored, longer messages are truncated
+ *
+ * The list cell and the message copy are allocated here and are not
+ * freed by this function.
+ */
+void
+error_list_append(ErrorList *error_list, char *error_message)
+{
+	ErrorListCell *cell;
+
+	cell = (ErrorListCell *) pg_malloc0(sizeof(ErrorListCell));
+
+	/*
+	 * NOTE(review): pg_malloc0() presumably terminates the program itself
+	 * on allocation failure; this check is retained defensively.
+	 */
+	if (cell == NULL)
+	{
+		log_err(_("unable to allocate memory; terminating.\n"));
+		exit(ERR_BAD_CONFIG);
+	}
+
+	/*
+	 * The buffer is zero-filled, so copying at most MAXLEN - 1 bytes
+	 * guarantees the stored message is always NUL-terminated, even if the
+	 * source message is MAXLEN characters or longer.
+	 */
+	cell->error_message = pg_malloc0(MAXLEN);
+	strncpy(cell->error_message, error_message, MAXLEN - 1);
+
+	/* Link the new cell in at the tail; an empty list also gets a new head */
+	if (error_list->tail)
+	{
+		error_list->tail->next = cell;
+	}
+	else
+	{
+		error_list->head = cell;
+	}
+
+	error_list->tail = cell;
+}
+
+
+/*
+ * repmgr_atoi()
+ *
+ * Convert the provided string to an integer using strtol().
+ *
+ * value:       string to convert
+ * config_item: name of the configuration item, used in error messages
+ * error_list:  if not NULL, any error message is appended to this list
+ *              and the function returns normally; if NULL, the error is
+ *              logged and the program exits with ERR_BAD_CONFIG
+ *
+ * The following are reported as errors: an empty string, a string which
+ * does not start with a number, a number followed by anything other than
+ * whitespace (e.g. "10abc"), a number which does not fit into a 32-bit
+ * integer, and a negative number.
+ *
+ * Returns the converted value; if an error was reported the return value
+ * should not be relied upon.
+ */
+int
+repmgr_atoi(const char *value, const char *config_item, ErrorList *error_list)
+{
+	char	   *endptr = NULL;
+	long		longval = 0;
+	char		error_message_buf[MAXLEN] = "";
+
+	/*
+	 * It's possible that some versions of strtol() don't treat an empty
+	 * string as an error.
+	 */
+	if (*value == '\0')
+	{
+		snprintf(error_message_buf,
+				 MAXLEN,
+				 _("no value provided for \"%s\""),
+				 config_item);
+	}
+	else
+	{
+		errno = 0;
+		longval = strtol(value, &endptr, 10);
+
+		/*
+		 * Tolerate trailing whitespace (e.g. before an end-of-line comment),
+		 * but nothing else, after the number.
+		 */
+		if (value != endptr && errno == 0)
+		{
+			while (*endptr == ' ' || *endptr == '\t' || *endptr == '\r')
+				endptr++;
+		}
+
+		/*
+		 * Reject: no digits consumed, strtol() overflow, trailing garbage,
+		 * or a value outside the 32-bit range of the return type (literal
+		 * bounds are used to avoid requiring an additional header).
+		 */
+		if (value == endptr || errno || *endptr != '\0' ||
+			longval > 2147483647L || longval < (-2147483647L - 1))
+		{
+			snprintf(error_message_buf,
+					 MAXLEN,
+					 _("\"%s\": invalid value (provided: \"%s\")"),
+					 config_item, value);
+
+			/* keep the final narrowing conversion well-defined */
+			longval = 0;
+		}
+	}
+
+	/* Currently there are no values which could be negative */
+	if (longval < 0)
+	{
+		snprintf(error_message_buf,
+				 MAXLEN,
+				 _("\"%s\" must be zero or greater (provided: %s)"),
+				 config_item, value);
+	}
+
+	/* Error message buffer is set */
+	if (error_message_buf[0] != '\0')
+	{
+		if (error_list == NULL)
+		{
+			log_err("%s\n", error_message_buf);
+			exit(ERR_BAD_CONFIG);
+		}
+
+		error_list_append(error_list, error_message_buf);
+	}
+
+	return (int32) longval;
+}
+
/*
* Split argument into old_dir and new_dir and append to tablespace mapping
@@ -797,3 +978,21 @@ parse_event_notifications_list(t_configuration_options *options, const char *arg
}
}
}
+
+
+
+/*
+ * exit_with_errors()
+ *
+ * Log a header line followed by every message stored in "config_errors",
+ * in list order, then terminate the program with ERR_BAD_CONFIG.
+ * This function does not return.
+ */
+static void
+exit_with_errors(ErrorList *config_errors)
+{
+	ErrorListCell *current = config_errors->head;
+
+	log_err(_("%s: following errors were found in the configuration file.\n"), progname());
+
+	/* Walk the list from head to tail, emitting one log line per error */
+	while (current != NULL)
+	{
+		log_err("%s\n", current->error_message);
+		current = current->next;
+	}
+
+	exit(ERR_BAD_CONFIG);
+}
+
diff --git a/config.h b/config.h
index 5731d14..4307802 100644
--- a/config.h
+++ b/config.h
@@ -24,6 +24,7 @@
#include "strutil.h"
+#define CONFIG_FILE_NAME "repmgr.conf"
typedef struct EventNotificationListCell
{
@@ -67,7 +68,7 @@ typedef struct
char ssh_options[QUERY_STR_LEN];
int master_response_timeout;
int reconnect_attempts;
- int reconnect_intvl;
+ int reconnect_interval;
char pg_bindir[MAXLEN];
char pg_ctl_options[MAXLEN];
char pg_basebackup_options[MAXLEN];
@@ -82,11 +83,29 @@ typedef struct
#define T_CONFIGURATION_OPTIONS_INITIALIZER { "", -1, NO_UPSTREAM_NODE, "", MANUAL_FAILOVER, -1, "", "", "", "", "", "", "", -1, -1, -1, "", "", "", "", 0, 0, 0, "", { NULL, NULL }, {NULL, NULL} }
+/* One entry in an ErrorList: an error message plus a link to the next entry */
+typedef struct ErrorListCell
+{
+ struct ErrorListCell *next;
+ /* NOTE(review): who allocates/frees this string is not visible here - confirm in error_list_append() */
+ char *error_message;
+} ErrorListCell;
+
+/*
+ * List of ErrorListCell entries, walked from head via each cell's next
+ * pointer (see exit_with_errors() in config.c).
+ */
+typedef struct ErrorList
+{
+ ErrorListCell *head;
+ /* presumably the last cell, kept so appends need no traversal - confirm in error_list_append() */
+ ErrorListCell *tail;
+} ErrorList;
+
+void set_progname(const char *argv0);
+const char * progname(void);
-bool load_config(const char *config_file, t_configuration_options *options, char *argv0);
+bool load_config(const char *config_file, bool verbose, t_configuration_options *options, char *argv0);
bool reload_config(t_configuration_options *orig_options);
bool parse_config(t_configuration_options *options);
void parse_line(char *buff, char *name, char *value);
char *trim(char *s);
+void error_list_append(ErrorList *error_list, char *error_message);
+int repmgr_atoi(const char *s,
+ const char *config_item,
+ ErrorList *error_list);
#endif
diff --git a/dbutils.c b/dbutils.c
index 03edf0b..b9f8b99 100644
--- a/dbutils.c
+++ b/dbutils.c
@@ -87,6 +87,8 @@ begin_transaction(PGconn *conn)
{
PGresult *res;
+ log_verbose(LOG_DEBUG, "begin_transaction()\n");
+
res = PQexec(conn, "BEGIN");
if (PQresultStatus(res) != PGRES_COMMAND_OK)
@@ -109,6 +111,8 @@ commit_transaction(PGconn *conn)
{
PGresult *res;
+ log_verbose(LOG_DEBUG, "commit_transaction()\n");
+
res = PQexec(conn, "COMMIT");
if (PQresultStatus(res) != PGRES_COMMAND_OK)
@@ -131,6 +135,8 @@ rollback_transaction(PGconn *conn)
{
PGresult *res;
+ log_verbose(LOG_DEBUG, "rollback_transaction()\n");
+
res = PQexec(conn, "ROLLBACK");
if (PQresultStatus(res) != PGRES_COMMAND_OK)
@@ -158,7 +164,8 @@ check_cluster_schema(PGconn *conn)
"SELECT 1 FROM pg_namespace WHERE nspname = '%s'",
get_repmgr_schema());
- log_debug(_("check_cluster_schema(): %s\n"), sqlquery);
+ log_verbose(LOG_DEBUG, "check_cluster_schema(): %s\n", sqlquery);
+
res = PQexec(conn, sqlquery);
if (PQresultStatus(res) != PGRES_TUPLES_OK)
{
@@ -188,17 +195,22 @@ is_standby(PGconn *conn)
{
PGresult *res;
int result = 0;
+ char *sqlquery = "SELECT pg_catalog.pg_is_in_recovery()";
- res = PQexec(conn, "SELECT pg_is_in_recovery()");
+ log_verbose(LOG_DEBUG, "is_standby(): %s\n", sqlquery);
+
+ res = PQexec(conn, sqlquery);
if (res == NULL || PQresultStatus(res) != PGRES_TUPLES_OK)
{
- log_err(_("Can't query server mode: %s"),
+ log_err(_("Unable to query server mode: %s\n"),
PQerrorMessage(conn));
result = -1;
}
else if (PQntuples(res) == 1 && strcmp(PQgetvalue(res, 0, 0), "t") == 0)
+ {
result = 1;
+ }
PQclear(res);
return result;
@@ -285,6 +297,8 @@ get_master_node_id(PGconn *conn, char *cluster)
get_repmgr_schema_quoted(conn),
cluster);
+ log_verbose(LOG_DEBUG, "get_master_node_id():\n%s\n", sqlquery);
+
res = PQexec(conn, sqlquery);
if (PQresultStatus(res) != PGRES_TUPLES_OK)
{
@@ -341,14 +355,17 @@ guc_set(PGconn *conn, const char *parameter, const char *op,
char sqlquery[QUERY_STR_LEN];
int retval = 1;
- sqlquery_snprintf(sqlquery, "SELECT true FROM pg_settings "
+ sqlquery_snprintf(sqlquery,
+ "SELECT true FROM pg_settings "
" WHERE name = '%s' AND setting %s '%s'",
parameter, op, value);
+ log_verbose(LOG_DEBUG, "guc_set():\n%s\n", sqlquery);
+
res = PQexec(conn, sqlquery);
if (PQresultStatus(res) != PGRES_TUPLES_OK)
{
- log_err(_("GUC setting check PQexec failed: %s"),
+ log_err(_("guc_set(): unable to execute query\n%s\n"),
PQerrorMessage(conn));
retval = -1;
}
@@ -379,10 +396,12 @@ guc_set_typed(PGconn *conn, const char *parameter, const char *op,
" WHERE name = '%s' AND setting::%s %s '%s'::%s",
parameter, datatype, op, value, datatype);
+ /* print the query on its own line, matching the guc_set() message format */
+ log_verbose(LOG_DEBUG, "guc_set_typed():\n%s\n", sqlquery);
+
res = PQexec(conn, sqlquery);
if (PQresultStatus(res) != PGRES_TUPLES_OK)
{
- log_err(_("GUC setting check PQexec failed: %s"),
+ log_err(_("guc_set_typed(): unable to execute query\n%s\n"),
PQerrorMessage(conn));
retval = -1;
}
@@ -403,15 +422,16 @@ get_cluster_size(PGconn *conn, char *size)
PGresult *res;
char sqlquery[QUERY_STR_LEN];
- sqlquery_snprintf(
- sqlquery,
- "SELECT pg_size_pretty(SUM(pg_database_size(oid))::bigint) "
+ sqlquery_snprintf(sqlquery,
+ "SELECT pg_catalog.pg_size_pretty(SUM(pg_catalog.pg_database_size(oid))::bigint) "
" FROM pg_database ");
+ log_verbose(LOG_DEBUG, "get_cluster_size():\n%s\n", sqlquery);
+
res = PQexec(conn, sqlquery);
if (res == NULL || PQresultStatus(res) != PGRES_TUPLES_OK)
{
- log_err(_("get_cluster_size(): PQexec failed: %s"),
+ log_err(_("get_cluster_size(): unable to execute query\n%s\n"),
PQerrorMessage(conn));
PQclear(res);
@@ -439,7 +459,7 @@ get_pg_setting(PGconn *conn, const char *setting, char *output)
" FROM pg_settings WHERE name = '%s'",
setting);
- log_debug(_("get_pg_setting(): %s\n"), sqlquery);
+ log_verbose(LOG_DEBUG, "get_pg_setting(): %s\n", sqlquery);
res = PQexec(conn, sqlquery);
@@ -461,13 +481,14 @@ get_pg_setting(PGconn *conn, const char *setting, char *output)
}
else
{
- log_err(_("unknown parameter: %s"), PQgetvalue(res, i, 0));
+ /* XXX highly unlikely this would ever happen */
+ log_err(_("get_pg_setting(): unknown parameter \"%s\""), PQgetvalue(res, i, 0));
}
}
if (success == true)
{
- log_debug(_("get_pg_setting(): returned value is '%s'\n"), output);
+ log_debug(_("get_pg_setting(): returned value is \"%s\"\n"), output);
}
PQclear(res);
@@ -512,13 +533,13 @@ get_upstream_connection(PGconn *standby_conn, char *cluster, int node_id,
cluster,
node_id);
- log_debug("get_upstream_connection(): %s\n", sqlquery);
+ log_verbose(LOG_DEBUG, "get_upstream_connection():\n%s\n", sqlquery);
res = PQexec(standby_conn, sqlquery);
if (PQresultStatus(res) != PGRES_TUPLES_OK)
{
- log_err(_("unable to get conninfo for upstream server: %s\n"),
+ log_err(_("unable to get conninfo for upstream server\n%s\n"),
PQerrorMessage(standby_conn));
PQclear(res);
return NULL;
@@ -538,7 +559,7 @@ get_upstream_connection(PGconn *standby_conn, char *cluster, int node_id,
PQclear(res);
- log_debug("conninfo is: '%s'\n", upstream_conninfo);
+ log_verbose(LOG_DEBUG, "get_upstream_connection(): conninfo is \"%s\"\n", upstream_conninfo);
upstream_conn = establish_db_connection(upstream_conninfo, false);
if (PQstatus(upstream_conn) != CONNECTION_OK)
@@ -553,24 +574,26 @@ get_upstream_connection(PGconn *standby_conn, char *cluster, int node_id,
/*
- * get a connection to master by reading repl_nodes, creating a connection
- * to each node (one at a time) and finding if it is a master or a standby
+ * Read the node list from the local node and attempt to connect to each node
+ * in turn to definitively establish whether it is the cluster primary.
+ *
+ * The node list is returned in the order which makes it likely that the
+ * current primary will be returned first, reducing the number of speculative
+ * connections which need to be made to other nodes.
*
- * NB: If master_conninfo_out may be NULL. If it is non-null, it is assumed to
- * point to allocated memory of MAXCONNINFO in length, and the master server
- * connection string is placed there.
+ * If master_conninfo_out points to allocated memory of MAXCONNINFO in length,
+ * the primary server's conninfo string will be copied there.
*/
PGconn *
get_master_connection(PGconn *standby_conn, char *cluster,
int *master_id, char *master_conninfo_out)
{
- PGconn *master_conn = NULL;
- PGresult *res1;
- PGresult *res2;
+ PGconn *remote_conn = NULL;
+ PGresult *res;
char sqlquery[QUERY_STR_LEN];
- char master_conninfo_stack[MAXCONNINFO];
- char *master_conninfo = &*master_conninfo_stack;
+ char remote_conninfo_stack[MAXCONNINFO];
+ char *remote_conninfo = &*remote_conninfo_stack;
int i,
node_id;
@@ -581,59 +604,60 @@ get_master_connection(PGconn *standby_conn, char *cluster,
}
/* find all nodes belonging to this cluster */
- log_info(_("finding node list for cluster '%s'\n"),
+ log_info(_("retrieving node list for cluster '%s'\n"),
cluster);
sqlquery_snprintf(sqlquery,
- "SELECT id, conninfo "
- " FROM %s.repl_nodes "
- " WHERE cluster = '%s' "
- " AND type != 'witness' ",
+ " SELECT id, conninfo, "
+ " CASE WHEN type = 'master' THEN 1 ELSE 2 END AS type_priority"
+ " FROM %s.repl_nodes "
+ " WHERE cluster = '%s' "
+ " AND type != 'witness' "
+ "ORDER BY active DESC, type_priority, priority, id",
get_repmgr_schema_quoted(standby_conn),
cluster);
- res1 = PQexec(standby_conn, sqlquery);
- if (PQresultStatus(res1) != PGRES_TUPLES_OK)
+ log_verbose(LOG_DEBUG, "get_master_connection():\n%s\n", sqlquery);
+
+ res = PQexec(standby_conn, sqlquery);
+ if (PQresultStatus(res) != PGRES_TUPLES_OK)
{
log_err(_("unable to retrieve node records: %s\n"),
PQerrorMessage(standby_conn));
- PQclear(res1);
+ PQclear(res);
return NULL;
}
- for (i = 0; i < PQntuples(res1); i++)
+ for (i = 0; i < PQntuples(res); i++)
{
- /* initialize with the values of the current node being processed */
- node_id = atoi(PQgetvalue(res1, i, 0));
- strncpy(master_conninfo, PQgetvalue(res1, i, 1), MAXCONNINFO);
- log_info(_("checking role of cluster node '%i'\n"),
- node_id);
- master_conn = establish_db_connection(master_conninfo, false);
+ int is_node_standby;
- if (PQstatus(master_conn) != CONNECTION_OK)
+ /* initialize with the values of the current node being processed */
+ node_id = atoi(PQgetvalue(res, i, 0));
+ strncpy(remote_conninfo, PQgetvalue(res, i, 1), MAXCONNINFO);
+ log_verbose(LOG_INFO,
+ _("checking role of cluster node '%i'\n"),
+ node_id);
+ remote_conn = establish_db_connection(remote_conninfo, false);
+
+ if (PQstatus(remote_conn) != CONNECTION_OK)
continue;
- /*
- * Can't use the is_standby() function here because on error that
- * function closes the connection passed and exits. This still needs
- * to close master_conn first.
- */
- res2 = PQexec(master_conn, "SELECT pg_is_in_recovery()");
+ is_node_standby = is_standby(remote_conn);
- if (PQresultStatus(res2) != PGRES_TUPLES_OK)
+ if (is_node_standby == -1)
{
- log_err(_("unable to retrieve recovery state from this node: %s\n"),
- PQerrorMessage(master_conn));
- PQclear(res2);
- PQfinish(master_conn);
+ log_err(_("unable to retrieve recovery state from node %i:\n%s\n"),
+ node_id,
+ PQerrorMessage(remote_conn));
+ PQfinish(remote_conn);
continue;
}
- /* if false, this is the master */
- if (strcmp(PQgetvalue(res2, 0, 0), "f") == 0)
+ /* if is_standby() returns 0, queried node is the master */
+ if (is_node_standby == 0)
{
- PQclear(res2);
- PQclear(res1);
+ PQclear(res);
log_debug(_("get_master_connection(): current master node is %i\n"), node_id);
if (master_id != NULL)
@@ -641,14 +665,12 @@ get_master_connection(PGconn *standby_conn, char *cluster,
*master_id = node_id;
}
- return master_conn;
- }
- else
- {
- /* if it is a standby, clear info */
- PQclear(res2);
- PQfinish(master_conn);
+ return remote_conn;
}
+
+
+ /* if it is a standby, clear connection info and continue*/
+ PQfinish(remote_conn);
}
/*
@@ -659,7 +681,7 @@ get_master_connection(PGconn *standby_conn, char *cluster,
* Probably we will need to check the error to know if we need to start
* failover procedure or just fix some situation on the standby.
*/
- PQclear(res1);
+ PQclear(res);
return NULL;
}
@@ -687,7 +709,7 @@ wait_connection_availability(PGconn *conn, long long timeout)
{
if (PQconsumeInput(conn) == 0)
{
- log_warning(_("wait_connection_availability: could not receive data from connection. %s\n"),
+ log_warning(_("wait_connection_availability(): could not receive data from connection. %s\n"),
PQerrorMessage(conn));
return 0;
}
@@ -714,7 +736,7 @@ wait_connection_availability(PGconn *conn, long long timeout)
if (select(sock, &read_set, NULL, NULL, &tmout) == -1)
{
log_warning(
- _("wait_connection_availability: select() returned with error: %s"),
+ _("wait_connection_availability(): select() returned with error\n%s\n"),
strerror(errno));
return -1;
}
@@ -730,7 +752,7 @@ wait_connection_availability(PGconn *conn, long long timeout)
return 1;
}
- log_warning(_("wait_connection_availability: timeout reached"));
+ log_warning(_("wait_connection_availability(): timeout reached"));
return -1;
}
@@ -765,6 +787,12 @@ cancel_query(PGconn *conn, int timeout)
return true;
}
+
+/*
+ * Return the repmgr schema as an unmodified string.
+ * This is useful for displaying the schema name in log messages;
+ * however, for inclusion in SQL statements, get_repmgr_schema_quoted()
+ * should always be used.
+ */
char *
get_repmgr_schema(void)
{
@@ -806,6 +834,8 @@ create_replication_slot(PGconn *conn, char *slot_name)
" WHERE slot_name = '%s' ",
slot_name);
+ log_verbose(LOG_DEBUG, "create_replication_slot():\n%s\n", sqlquery);
+
res = PQexec(conn, sqlquery);
if (!res || PQresultStatus(res) != PGRES_TUPLES_OK)
{
@@ -826,8 +856,8 @@ create_replication_slot(PGconn *conn, char *slot_name)
if (strcmp(PQgetvalue(res, 0, 0), "f") == 0)
{
PQclear(res);
- log_debug(_("Replication slot '%s' exists but is inactive; reusing\n"),
- slot_name);
+ log_debug("Replication slot '%s' exists but is inactive; reusing\n",
+ slot_name);
return true;
}
@@ -842,6 +872,7 @@ create_replication_slot(PGconn *conn, char *slot_name)
slot_name);
log_debug(_("create_replication_slot(): Creating slot '%s' on primary\n"), slot_name);
+ log_verbose(LOG_DEBUG, "create_replication_slot():\n%s\n", sqlquery);
res = PQexec(conn, sqlquery);
if (!res || PQresultStatus(res) != PGRES_TUPLES_OK)
@@ -857,6 +888,33 @@ create_replication_slot(PGconn *conn, char *slot_name)
return true;
}
+/*
+ * Drop the named replication slot by running pg_drop_replication_slot()
+ * on the supplied connection.
+ *
+ * Returns true if the query succeeded; otherwise logs the libpq error
+ * message and returns false. The query result is released on both paths.
+ */
+bool
+drop_replication_slot(PGconn *conn, char *slot_name)
+{
+    char        sqlquery[QUERY_STR_LEN];
+    PGresult   *res;
+
+    /*
+     * NOTE(review): slot_name is interpolated into the query without
+     * escaping - confirm callers only pass trusted slot names.
+     */
+    sqlquery_snprintf(sqlquery,
+                      "SELECT pg_drop_replication_slot('%s')",
+                      slot_name);
+
+    log_verbose(LOG_DEBUG, "drop_replication_slot():\n%s\n", sqlquery);
+
+    res = PQexec(conn, sqlquery);
+    if (!res || PQresultStatus(res) != PGRES_TUPLES_OK)
+    {
+        log_err(_("unable to drop replication slot \"%s\":\n %s\n"),
+                slot_name,
+                PQerrorMessage(conn));
+        PQclear(res);
+        return false;
+    }
+
+    /* the result must be released on the success path as well */
+    PQclear(res);
+
+    log_verbose(LOG_DEBUG, "replication slot \"%s\" successfully dropped\n",
+                slot_name);
+
+    return true;
+}
+
bool
start_backup(PGconn *conn, char *first_wal_segment, bool fast_checkpoint)
@@ -865,11 +923,11 @@ start_backup(PGconn *conn, char *first_wal_segment, bool fast_checkpoint)
PGresult *res;
sqlquery_snprintf(sqlquery,
- "SELECT pg_xlogfile_name(pg_start_backup('repmgr_standby_clone_%ld', %s))",
+ "SELECT pg_catalog.pg_xlogfile_name(pg_catalog.pg_start_backup('repmgr_standby_clone_%ld', %s))",
time(NULL),
fast_checkpoint ? "TRUE" : "FALSE");
- log_debug(_("standby clone: %s\n"), sqlquery);
+ log_verbose(LOG_DEBUG, "start_backup():\n%s\n", sqlquery);
res = PQexec(conn, sqlquery);
if (PQresultStatus(res) != PGRES_TUPLES_OK)
@@ -884,7 +942,7 @@ start_backup(PGconn *conn, char *first_wal_segment, bool fast_checkpoint)
char *first_wal_seg_pq = PQgetvalue(res, 0, 0);
size_t buf_sz = strlen(first_wal_seg_pq);
- first_wal_segment = malloc(buf_sz + 1);
+ first_wal_segment = pg_malloc0(buf_sz + 1);
xsnprintf(first_wal_segment, buf_sz + 1, "%s", first_wal_seg_pq);
}
@@ -900,7 +958,7 @@ stop_backup(PGconn *conn, char *last_wal_segment)
char sqlquery[QUERY_STR_LEN];
PGresult *res;
- sqlquery_snprintf(sqlquery, "SELECT pg_xlogfile_name(pg_stop_backup())");
+ sqlquery_snprintf(sqlquery, "SELECT pg_catalog.pg_xlogfile_name(pg_catalog.pg_stop_backup())");
res = PQexec(conn, sqlquery);
if (PQresultStatus(res) != PGRES_TUPLES_OK)
@@ -915,7 +973,7 @@ stop_backup(PGconn *conn, char *last_wal_segment)
char *last_wal_seg_pq = PQgetvalue(res, 0, 0);
size_t buf_sz = strlen(last_wal_seg_pq);
- last_wal_segment = malloc(buf_sz + 1);
+ last_wal_segment = pg_malloc0(buf_sz + 1);
xsnprintf(last_wal_segment, buf_sz + 1, "%s", last_wal_seg_pq);
}
@@ -936,6 +994,8 @@ set_config_bool(PGconn *conn, const char *config_param, bool state)
config_param,
state ? "TRUE" : "FALSE");
+ log_verbose(LOG_DEBUG, "set_config_bool():\n%s\n", sqlquery);
+
res = PQexec(conn, sqlquery);
if (PQresultStatus(res) != PGRES_COMMAND_OK)
@@ -967,11 +1027,13 @@ copy_configuration(PGconn *masterconn, PGconn *witnessconn, char *cluster_name)
int i;
sqlquery_snprintf(sqlquery, "TRUNCATE TABLE %s.repl_nodes", get_repmgr_schema_quoted(witnessconn));
- log_debug("copy_configuration: %s\n", sqlquery);
+
+ log_verbose(LOG_DEBUG, "copy_configuration():\n%s\n", sqlquery);
+
res = PQexec(witnessconn, sqlquery);
if (!res || PQresultStatus(res) != PGRES_COMMAND_OK)
{
- fprintf(stderr, "Cannot clean node details in the witness, %s\n",
+ log_err(_("Unable to truncate witness server's repl_nodes table:\n%s\n"),
PQerrorMessage(witnessconn));
+ /* release the failed result; PQclear() is a no-op when res is NULL */
+ PQclear(res);
return false;
}
@@ -979,10 +1041,13 @@ copy_configuration(PGconn *masterconn, PGconn *witnessconn, char *cluster_name)
sqlquery_snprintf(sqlquery,
"SELECT id, type, upstream_node_id, name, conninfo, priority, slot_name FROM %s.repl_nodes",
get_repmgr_schema_quoted(masterconn));
+
+ log_verbose(LOG_DEBUG, "copy_configuration():\n%s\n", sqlquery);
+
res = PQexec(masterconn, sqlquery);
if (PQresultStatus(res) != PGRES_TUPLES_OK)
{
- fprintf(stderr, "Can't get configuration from master: %s\n",
+ log_err("Unable to retrieve node records from master:\n%s\n",
PQerrorMessage(masterconn));
PQclear(res);
return false;
@@ -991,9 +1056,11 @@ copy_configuration(PGconn *masterconn, PGconn *witnessconn, char *cluster_name)
for (i = 0; i < PQntuples(res); i++)
{
bool node_record_created;
- char *witness = PQgetvalue(res, i, 4);
- log_debug(_("copy_configuration(): %s\n"), witness);
+ log_verbose(LOG_DEBUG,
+ "copy_configuration(): writing node record for node %s (id: %s)\n",
+ PQgetvalue(res, i, 4),
+ PQgetvalue(res, i, 0));
node_record_created = create_node_record(witnessconn,
"copy_configuration",
@@ -1013,7 +1080,9 @@ copy_configuration(PGconn *masterconn, PGconn *witnessconn, char *cluster_name)
if (node_record_created == false)
{
- fprintf(stderr, "Unable to copy node record to witness database: %s\n",
+ PQclear(res);
+
+ log_err("Unable to copy node record to witness database\n%s\n",
PQerrorMessage(witnessconn));
return false;
}
@@ -1069,6 +1138,7 @@ create_node_record(PGconn *conn, char *action, int node, char *type, int upstrea
maxlen_snprintf(slot_name_buf, "%s", "NULL");
}
+ /* XXX convert to placeholder query */
sqlquery_snprintf(sqlquery,
"INSERT INTO %s.repl_nodes "
" (id, type, upstream_node_id, cluster, "
@@ -1084,16 +1154,18 @@ create_node_record(PGconn *conn, char *action, int node, char *type, int upstrea
slot_name_buf,
priority);
+ log_verbose(LOG_DEBUG, "create_node_record(): %s\n", sqlquery);
+
if (action != NULL)
{
- log_debug(_("%s: %s\n"), action, sqlquery);
+ log_verbose(LOG_DEBUG, "create_node_record(): action is \"%s\"\n", action);
}
res = PQexec(conn, sqlquery);
if (!res || PQresultStatus(res) != PGRES_COMMAND_OK)
{
- log_warning(_("Unable to create node record: %s\n"),
- PQerrorMessage(conn));
+ log_err(_("Unable to create node record\n%s\n"),
+ PQerrorMessage(conn));
PQclear(res);
return false;
}
@@ -1115,15 +1187,18 @@ delete_node_record(PGconn *conn, int node, char *action)
" WHERE id = %d",
get_repmgr_schema_quoted(conn),
node);
+
+ log_verbose(LOG_DEBUG, "delete_node_record(): %s\n", sqlquery);
+
if (action != NULL)
{
- log_debug(_("%s: %s\n"), action, sqlquery);
+ /* prefix the message with this function's own name */
+ log_verbose(LOG_DEBUG, "delete_node_record(): action is \"%s\"\n", action);
}
res = PQexec(conn, sqlquery);
if (!res || PQresultStatus(res) != PGRES_COMMAND_OK)
{
- log_warning(_("Unable to delete node record: %s\n"),
+ log_err(_("Unable to delete node record: %s\n"),
PQerrorMessage(conn));
PQclear(res);
return false;
@@ -1195,6 +1270,8 @@ create_event_record(PGconn *conn, t_configuration_options *options, int node_id,
" RETURNING event_timestamp ",
get_repmgr_schema_quoted(conn));
+ log_verbose(LOG_DEBUG, "create_event_record():\n%s\n", sqlquery);
+
res = PQexecParams(conn,
sqlquery,
4,
@@ -1206,7 +1283,6 @@ create_event_record(PGconn *conn, t_configuration_options *options, int node_id,
if (!res || PQresultStatus(res) != PGRES_TUPLES_OK)
{
-
log_warning(_("Unable to create event record: %s\n"),
PQerrorMessage(conn));
@@ -1217,7 +1293,7 @@ create_event_record(PGconn *conn, t_configuration_options *options, int node_id,
{
/* Store timestamp to send to the notification command */
strncpy(event_timestamp, PQgetvalue(res, 0, 0), MAXLEN);
- log_debug(_("Event timestamp is: %s\n"), event_timestamp);
+ log_verbose(LOG_DEBUG, "create_event_record(): Event timestamp is \"%s\"\n", event_timestamp);
}
PQclear(res);
@@ -1337,12 +1413,13 @@ create_event_record(PGconn *conn, t_configuration_options *options, int node_id,
*dst_ptr = '\0';
- log_debug(_("Executing: %s\n"), parsed_command);
+ log_debug("create_event_record(): executing\n%s\n", parsed_command);
r = system(parsed_command);
if (r != 0)
{
log_warning(_("Unable to execute event notification command\n"));
+ log_info(_("Parsed event notification command was:\n%s\n"), parsed_command);
success = false;
}
}
@@ -1350,6 +1427,50 @@ create_event_record(PGconn *conn, t_configuration_options *options, int node_id,
return success;
}
+/*
+ * Rewrite the type, upstream node id and active flag of the repl_nodes
+ * row identified by cluster_name and this_node_id (used, for example, when
+ * an inactive primary has been converted to a standby).
+ *
+ * Returns true if the UPDATE succeeded; otherwise logs the libpq error
+ * message and returns false.
+ */
+bool
+update_node_record_status(PGconn *conn, char *cluster_name, int this_node_id, char *type, int upstream_node_id, bool active)
+{
+    char        sqlquery[QUERY_STR_LEN];
+    const char *active_literal = active ? "TRUE" : "FALSE";
+    PGresult   *res;
+    bool        updated;
+
+    sqlquery_snprintf(sqlquery,
+                      " UPDATE %s.repl_nodes "
+                      " SET type = '%s', "
+                      " upstream_node_id = %i, "
+                      " active = %s "
+                      " WHERE cluster = '%s' "
+                      " AND id = %i ",
+                      get_repmgr_schema_quoted(conn),
+                      type,
+                      upstream_node_id,
+                      active_literal,
+                      cluster_name,
+                      this_node_id);
+
+    log_verbose(LOG_DEBUG, "update_node_record_status():\n%s\n", sqlquery);
+
+    res = PQexec(conn, sqlquery);
+    updated = (PQresultStatus(res) == PGRES_COMMAND_OK);
+
+    if (!updated)
+    {
+        log_err(_("Unable to update node record: %s\n"),
+                PQerrorMessage(conn));
+    }
+
+    /* the result is released whether or not the update succeeded */
+    PQclear(res);
+
+    return updated;
+}
+
+
bool
update_node_record_set_upstream(PGconn *conn, char *cluster_name, int this_node_id, int new_upstream_node_id)
{
@@ -1367,6 +1488,9 @@ update_node_record_set_upstream(PGconn *conn, char *cluster_name, int this_node_
new_upstream_node_id,
cluster_name,
this_node_id);
+
+ log_verbose(LOG_DEBUG, "update_node_record_set_upstream():\n%s\n", sqlquery);
+
res = PQexec(conn, sqlquery);
if (PQresultStatus(res) != PGRES_COMMAND_OK)
@@ -1398,7 +1522,7 @@ get_node_record(PGconn *conn, char *cluster, int node_id)
cluster,
node_id);
- log_debug("get_node_record(): %s\n", sqlquery);
+ log_verbose(LOG_DEBUG, "get_node_record():\n%s\n", sqlquery);
return PQexec(conn, sqlquery);
}
diff --git a/dbutils.h b/dbutils.h
index caa4a2f..5232ed8 100644
--- a/dbutils.h
+++ b/dbutils.h
@@ -20,10 +20,51 @@
#ifndef _REPMGR_DBUTILS_H_
#define _REPMGR_DBUTILS_H_
+#include "access/xlogdefs.h"
+
#include "config.h"
#include "strutil.h"
+/*
+ * Node type values stored in t_node_info.type; UNKNOWN (0) is the value
+ * supplied by T_NODE_INFO_INITIALIZER.
+ */
+typedef enum {
+ UNKNOWN = 0,
+ MASTER,
+ STANDBY,
+ WITNESS
+} t_server_type;
+
+/*
+ * Struct to store node information
+ *
+ * T_NODE_INFO_INITIALIZER initializes this struct positionally, so its
+ * entries must be kept in the same order as the fields declared here.
+ */
+typedef struct s_node_info
+{
+ int node_id;
+ int upstream_node_id;
+ t_server_type type;
+ char name[MAXLEN];
+ char conninfo_str[MAXLEN];
+ char slot_name[MAXLEN];
+ int priority;
+ bool active;
+ /* NOTE(review): meaning of is_ready / is_visible is not visible in this header - confirm against repmgrd.c */
+ bool is_ready;
+ bool is_visible;
+ XLogRecPtr xlog_location;
+} t_node_info;
+
+
+/*
+ * Positional initializer for t_node_info. Entries follow the struct's field
+ * order: node_id, upstream_node_id, type, name, conninfo_str, slot_name,
+ * priority, active, is_ready, is_visible, xlog_location.
+ */
+#define T_NODE_INFO_INITIALIZER { \
+ NODE_NOT_FOUND, \
+ NO_UPSTREAM_NODE, \
+ UNKNOWN, \
+ "", \
+ "", \
+ "", \
+ DEFAULT_PRIORITY, \
+ true, \
+ false, \
+ false, \
+ InvalidXLogRecPtr \
+}
PGconn *establish_db_connection(const char *conninfo,
const bool exit_on_error);
@@ -58,6 +99,7 @@ bool cancel_query(PGconn *conn, int timeout);
char *get_repmgr_schema(void);
char *get_repmgr_schema_quoted(PGconn *conn);
bool create_replication_slot(PGconn *conn, char *slot_name);
+bool drop_replication_slot(PGconn *conn, char *slot_name);
bool start_backup(PGconn *conn, char *first_wal_segment, bool fast_checkpoint);
bool stop_backup(PGconn *conn, char *last_wal_segment);
@@ -66,6 +108,7 @@ bool copy_configuration(PGconn *masterconn, PGconn *witnessconn, char *cluster_
bool create_node_record(PGconn *conn, char *action, int node, char *type, int upstream_node, char *cluster_name, char *node_name, char *conninfo, int priority, char *slot_name);
bool delete_node_record(PGconn *conn, int node, char *action);
bool create_event_record(PGconn *conn, t_configuration_options *options, int node_id, char *event, bool successful, char *details);
+bool update_node_record_status(PGconn *conn, char *cluster_name, int this_node_id, char *type, int upstream_node_id, bool active);
bool update_node_record_set_upstream(PGconn *conn, char *cluster_name, int this_node_id, int new_upstream_node_id);
PGresult * get_node_record(PGconn *conn, char *cluster, int node_id);
diff --git a/debian/changelog b/debian/changelog
index 0c68899..28b95de 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,3 +1,17 @@
+repmgr (3.0.3-2) unstable; urgency=medium
+
+ * Update debian/changelog to refer to the postgresql-9.5 transition RC bug.
+
+ -- Marco Nenciarini <[email protected]> Sat, 23 Jan 2016 10:08:27 +0100
+
+repmgr (3.0.3-1) unstable; urgency=medium
+
+ * Update for postgresql-9.5 transition (Closes: #811139)
+ * Fix build on PostgreSQL older than the current libpq
+ * Imported Upstream version 3.0.3
+
+ -- Marco Nenciarini <[email protected]> Fri, 22 Jan 2016 14:50:10 +0100
+
repmgr (3.0.2-2~bpo70+1) wheezy-backports-sloppy; urgency=low
* Rebuild for wheezy-backports-sloppy to get PostgreSQL 9.4 support.
diff --git a/debian/control b/debian/control
index eaa4c5b..4a933a5 100644
--- a/debian/control
+++ b/debian/control
@@ -37,6 +37,7 @@ Description: replication manager for PostgreSQL common files
Package: repmgr
Architecture: all
Depends:
+ postgresql-9.5-repmgr |
postgresql-9.4-repmgr |
postgresql-9.3-repmgr |
postgresql-9.2-repmgr |
@@ -59,6 +60,7 @@ Package: repmgr-dbg
Section: debug
Architecture: all
Depends:
+ postgresql-9.5-repmgr-dbg |
postgresql-9.4-repmgr-dbg |
postgresql-9.3-repmgr-dbg |
postgresql-9.2-repmgr-dbg |
@@ -68,11 +70,11 @@ Depends:
Description: transitional dummy package
This is a transitional dummy package. It can safely be removed.
-Package: postgresql-9.4-repmgr
+Package: postgresql-9.5-repmgr
Architecture: any
-Depends: postgresql-9.4, repmgr-common (>= ${source:Version}), ${misc:Depends}, ${shlibs:Depends}
+Depends: postgresql-9.5, repmgr-common (>= ${source:Version}), ${misc:Depends}, ${shlibs:Depends}
Suggests: rsync
-Description: replication manager for PostgreSQL 9.4
+Description: replication manager for PostgreSQL 9.5
Since version 9.0, PostgreSQL allow you to have replicated hot
standby servers which you can query and/or use for high availability.
While the main components of the feature are included with
@@ -82,15 +84,15 @@ Description: replication manager for PostgreSQL 9.4
repmgr allows you to monitor and manage your replicated PostgreSQL
databases as a single cluster.
.
- This package contains repmgr for PostgreSQL 9.4.
+ This package contains repmgr for PostgreSQL 9.5.
-Package: postgresql-9.4-repmgr-dbg
+Package: postgresql-9.5-repmgr-dbg
Section: debug
Architecture: any
-Depends: postgresql-9.4-repmgr (= ${binary:Version}), ${misc:Depends}, ${shlibs:Depends}
+Depends: postgresql-9.5-repmgr (= ${binary:Version}), ${misc:Depends}, ${shlibs:Depends}
Replaces: repmgr-dbg (<< 3.0.1-1)
Breaks: repmgr-dbg (<< 3.0.1-1)
-Description: debug symbols for repmgr for PostgreSQL 9.4
+Description: debug symbols for repmgr for PostgreSQL 9.5
Since version 9.0, PostgreSQL allow you to have replicated hot
standby servers which you can query and/or use for high availability.
While the main components of the feature are included with
@@ -100,5 +102,5 @@ Description: debug symbols for repmgr for PostgreSQL 9.4
repmgr allows you to monitor and manage your replicated PostgreSQL
databases as a single cluster.
.
- This package contains debug symbols for repmgr for PostgreSQL 9.4.
+ This package contains debug symbols for repmgr for PostgreSQL 9.5.
diff --git a/debian/control.in b/debian/control.in
index 52d74c1..66202a1 100644
--- a/debian/control.in
+++ b/debian/control.in
@@ -37,6 +37,7 @@ Description: replication manager for PostgreSQL common files
Package: repmgr
Architecture: all
Depends:
+ postgresql-9.5-repmgr |
postgresql-9.4-repmgr |
postgresql-9.3-repmgr |
postgresql-9.2-repmgr |
@@ -93,6 +94,7 @@ Package: repmgr-dbg
Section: debug
Architecture: all
Depends:
+ postgresql-9.5-repmgr-dbg |
postgresql-9.4-repmgr-dbg |
postgresql-9.3-repmgr-dbg |
postgresql-9.2-repmgr-dbg |
diff --git a/debian/patches/makefile-libpq-internal.patch b/debian/patches/makefile-libpq-internal.patch
new file mode 100644
index 0000000..b1e13ff
--- /dev/null
+++ b/debian/patches/makefile-libpq-internal.patch
@@ -0,0 +1,24 @@
+From: Marco Nenciarini <[email protected]>
+Date: Fri, 22 Jan 2016 14:19:45 +0100
+Subject: Fix build on PostgreSQL older than the current libpq
+
+ Make sure that the includedir_internal directory is used before the
+ includedir_server, otherwise the build may fail for PostgreSQL
+ versions older than the libpq version.
+---
+ Makefile | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/Makefile b/Makefile
+index 75d1806..aa9095d 100644
+--- a/Makefile
++++ b/Makefile
+@@ -8,7 +8,7 @@ repmgr_OBJS = dbutils.o check_dir.o config.o repmgr.o log.o strutil.o
+ DATA = repmgr.sql uninstall_repmgr.sql
+ REGRESS = repmgr_funcs repmgr_test
+
+-PG_CPPFLAGS = -I$(libpq_srcdir)
++PG_CPPFLAGS = -I$(includedir_internal) -I$(libpq_srcdir)
+ PG_LIBS = $(libpq_pgport)
+
+ all: repmgrd repmgr
diff --git a/debian/patches/makefile-no-libs.patch b/debian/patches/makefile-no-libs.patch
index 2d2ef6d..0707c53 100644
--- a/debian/patches/makefile-no-libs.patch
+++ b/debian/patches/makefile-no-libs.patch
@@ -7,7 +7,7 @@ Subject: makefile-no-libs
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/Makefile b/Makefile
-index 397d484..1311d11 100644
+index f79f87e..75d1806 100644
--- a/Makefile
+++ b/Makefile
@@ -15,11 +15,11 @@ all: repmgrd repmgr
diff --git a/debian/patches/regress.patch b/debian/patches/regress.patch
index cfe8c1e..e1379eb 100644
--- a/debian/patches/regress.patch
+++ b/debian/patches/regress.patch
@@ -13,7 +13,7 @@ Subject: regress
create mode 100644 sql/repmgr_test.sql
diff --git a/Makefile b/Makefile
-index 1cf5047..397d484 100644
+index e4e48e3..f79f87e 100644
--- a/Makefile
+++ b/Makefile
@@ -6,6 +6,7 @@ repmgrd_OBJS = dbutils.o config.o repmgrd.o log.o strutil.o
@@ -26,7 +26,7 @@ index 1cf5047..397d484 100644
PG_LIBS = $(libpq_pgport)
diff --git a/expected/repmgr_funcs.out b/expected/repmgr_funcs.out
new file mode 100644
-index 0000000..eb64530
+index 0000000..6a6bf9f
--- /dev/null
+++ b/expected/repmgr_funcs.out
@@ -0,0 +1,18 @@
diff --git a/debian/patches/series b/debian/patches/series
index baf0a57..eaf0849 100644
--- a/debian/patches/series
+++ b/debian/patches/series
@@ -1,2 +1,3 @@
regress.patch
makefile-no-libs.patch
+makefile-libpq-internal.patch
diff --git a/errcode.h b/errcode.h
index a67f513..b6ebd73 100644
--- a/errcode.h
+++ b/errcode.h
@@ -36,5 +36,6 @@
#define ERR_SYS_FAILURE 13
#define ERR_BAD_BASEBACKUP 14
#define ERR_INTERNAL 15
+#define ERR_MONITORING_FAIL 16
#endif /* _ERRCODE_H_ */
diff --git a/log.c b/log.c
index 7d738b9..6a43c79 100644
--- a/log.c
+++ b/log.c
@@ -39,13 +39,37 @@
/* #define REPMGR_DEBUG */
+static int detect_log_facility(const char *facility);
+static void _stderr_log_with_level(const char *level_name, int level, const char *fmt, va_list ap);
+
+int log_type = REPMGR_STDERR;
+/* a message is written to stderr only when log_level >= the message's level */
+int log_level = LOG_NOTICE;
+/*
+ * Level of the most recent log request; recorded by _stderr_log_with_level()
+ * before the log_level check and reused by log_hint().
+ */
+int last_log_level = LOG_NOTICE;
+/* log_verbose() produces output only while this is true */
+int verbose_logging = false;
+/* log_hint() produces output only while this is false */
+int terse_logging = false;
+
+/*
+ * Variadic front end for _stderr_log_with_level(): collects the arguments
+ * following fmt into a va_list and passes level_name, level and fmt through
+ * unchanged.
+ */
void
stderr_log_with_level(const char *level_name, int level, const char *fmt, ...)
{
+ va_list arglist;
+
+ va_start(arglist, fmt);
+ _stderr_log_with_level(level_name, level, fmt, arglist);
+ va_end(arglist);
+}
+
+static void
+_stderr_log_with_level(const char *level_name, int level, const char *fmt, va_list ap)
+{
time_t t;
struct tm *tm;
char buff[100];
- va_list ap;
+
+ /*
+ * Store the requested level so that if there's a subsequent
+ * log_hint(), we can suppress that if appropriate.
+ */
+ last_log_level = level;
if (log_level >= level)
{
@@ -54,24 +78,74 @@ stderr_log_with_level(const char *level_name, int level, const char *fmt, ...)
strftime(buff, 100, "[%Y-%m-%d %H:%M:%S]", tm);
fprintf(stderr, "%s [%s] ", buff, level_name);
- va_start(ap, fmt);
vfprintf(stderr, fmt, ap);
- va_end(ap);
fflush(stderr);
}
}
+void
+log_hint(const char *fmt, ...)
+{
+ va_list ap;
+
+ if (terse_logging == false)
+ {
+ va_start(ap, fmt);
+ _stderr_log_with_level("HINT", last_log_level, fmt, ap);
+ va_end(ap);
+ }
+}
+
-static int detect_log_level(const char *level);
-static int detect_log_facility(const char *facility);
+void
+log_verbose(int level, const char *fmt, ...)
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+
+ if (verbose_logging == true)
+ {
+ switch(level)
+ {
+ case LOG_EMERG:
+ _stderr_log_with_level("EMERG", level, fmt, ap);
+ break;
+ case LOG_ALERT:
+ _stderr_log_with_level("ALERT", level, fmt, ap);
+ break;
+ case LOG_CRIT:
+ _stderr_log_with_level("CRIT", level, fmt, ap);
+ break;
+ case LOG_ERR:
+ _stderr_log_with_level("ERR", level, fmt, ap);
+ break;
+ case LOG_WARNING:
+ _stderr_log_with_level("WARNING", level, fmt, ap);
+ break;
+ case LOG_NOTICE:
+ _stderr_log_with_level("NOTICE", level, fmt, ap);
+ break;
+ case LOG_INFO:
+ _stderr_log_with_level("INFO", level, fmt, ap);
+ break;
+ case LOG_DEBUG:
+ _stderr_log_with_level("DEBUG", level, fmt, ap);
+ break;
+ }
+ }
+
+ va_end(ap);
+}
-int log_type = REPMGR_STDERR;
-int log_level = LOG_NOTICE;
bool
-logger_init(t_configuration_options * opts, const char *ident, const char *level, const char *facility)
+logger_init(t_configuration_options * opts, const char *ident)
{
+ char *level = opts->loglevel;
+ char *facility = opts->logfacility;
+
int l;
int f;
@@ -95,10 +169,10 @@ logger_init(t_configuration_options * opts, const char *ident, const char *level
printf("Assigned level for logger: %d\n", l);
#endif
- if (l > 0)
+ if (l >= 0)
log_level = l;
else
- stderr_log_warning(_("Cannot detect log level %s (use any of DEBUG, INFO, NOTICE, WARNING, ERR, ALERT, CRIT or EMERG)\n"), level);
+ stderr_log_warning(_("Invalid log level \"%s\" (available values: DEBUG, INFO, NOTICE, WARNING, ERR, ALERT, CRIT or EMERG)\n"), level);
}
if (facility && *facility)
@@ -174,9 +248,9 @@ logger_init(t_configuration_options * opts, const char *ident, const char *level
}
return true;
-
}
+
bool
logger_shutdown(void)
{
@@ -189,17 +263,32 @@ logger_shutdown(void)
}
/*
- * Set a minimum logging level. Intended for command line verbosity
- * options, which might increase requested logging over what's specified
- * in the regular configuration file.
+ * Indicate whether extra-verbose logging is required. This will
+ * generate a lot of output, particularly debug logging, and should
+ * not be permanently enabled in production.
+ *
+ * NOTE: in previous repmgr versions, this option forced the log
+ * level to INFO.
*/
void
-logger_min_verbose(int minimum)
+logger_set_verbose(void)
{
- if (log_level < minimum)
- log_level = minimum;
+ verbose_logging = true;
}
+
+/*
+ * Indicate whether some non-critical log messages can be omitted.
+ * Currently this includes warnings about irrelevant command line
+ * options and hints.
+ */
+
+void logger_set_terse(void)
+{
+ terse_logging = true;
+}
+
+
int
detect_log_level(const char *level)
{
@@ -220,17 +309,16 @@ detect_log_level(const char *level)
if (!strcmp(level, "EMERG"))
return LOG_EMERG;
- return 0;
+ return -1;
}
-int
+static int
detect_log_facility(const char *facility)
{
int local = 0;
if (!strncmp(facility, "LOCAL", 5) && strlen(facility) == 6)
{
-
local = atoi(&facility[5]);
switch (local)
diff --git a/log.h b/log.h
index f6b80c7..b74f1db 100644
--- a/log.h
+++ b/log.h
@@ -112,13 +112,19 @@ __attribute__((format(PG_PRINTF_ATTRIBUTE, 3, 4)));
#endif
+int detect_log_level(const char *level);
+
/* Logger initialisation and shutdown */
+
+bool logger_init(t_configuration_options * opts, const char *ident);
+
bool logger_shutdown(void);
-bool logger_init(t_configuration_options * opts, const char *ident,
- const char *level, const char *facility);
+void logger_set_verbose(void);
+void logger_set_terse(void);
-void logger_min_verbose(int minimum);
+void log_hint(const char *fmt, ...);
+void log_verbose(int level, const char *fmt, ...);
extern int log_type;
extern int log_level;
diff --git a/repmgr.c b/repmgr.c
index a41f622..f15491f 100644
--- a/repmgr.c
+++ b/repmgr.c
@@ -101,22 +101,20 @@ static void do_cluster_show(void);
static void do_cluster_cleanup(void);
static void do_check_upstream_config(void);
-static void error_list_append(char *error_message);
static void exit_with_errors(void);
-static void help(const char *progname);
+static void print_error_list(ErrorList *error_list, int log_level);
+static void help(void);
/* Global variables */
-static const char *progname;
static const char *keywords[6];
static const char *values[6];
static bool config_file_required = true;
-/* XXX This should be mapped into a command line option */
-bool require_password = false;
-
/* Initialization of runtime options */
t_runtime_options runtime_options = T_RUNTIME_OPTIONS_INITIALIZER;
t_configuration_options options = T_CONFIGURATION_OPTIONS_INITIALIZER;
+static bool wal_keep_segments_used = false;
+
static char *server_mode = NULL;
static char *server_cmd = NULL;
@@ -126,8 +124,10 @@ static char repmgr_slot_name[MAXLEN] = "";
static char *repmgr_slot_name_ptr = NULL;
static char path_buf[MAXLEN] = "";
-/* Collate command line errors here for friendlier reporting */
-static ErrorList cli_errors = { NULL, NULL };
+/* Collate command line errors and warnings here for friendlier reporting */
+ErrorList cli_errors = { NULL, NULL };
+ErrorList cli_warnings = { NULL, NULL };
+
int
main(int argc, char **argv)
@@ -151,6 +151,8 @@ main(int argc, char **argv)
{"pg_bindir", required_argument, NULL, 'b'},
{"rsync-only", no_argument, NULL, 'r'},
{"fast-checkpoint", no_argument, NULL, 'c'},
+ {"log-level", required_argument, NULL, 'L'},
+ {"terse", required_argument, NULL, 't'},
{"initdb-no-pwprompt", no_argument, NULL, 1},
{"check-upstream-config", no_argument, NULL, 2},
{"recovery-min-apply-delay", required_argument, NULL, 3},
@@ -164,25 +166,31 @@ main(int argc, char **argv)
int c, targ;
int action = NO_ACTION;
bool check_upstream_config = false;
- bool wal_keep_segments_used = false;
bool config_file_parsed = false;
char *ptr = NULL;
- progname = get_progname(argv[0]);
+ set_progname(argv[0]);
+
/* Prevent getopt_long() from printing an error message */
opterr = 0;
- while ((c = getopt_long(argc, argv, "?Vd:h:p:U:S:D:l:f:R:w:k:FWIvb:r:c", long_options,
+ while ((c = getopt_long(argc, argv, "?Vd:h:p:U:S:D:l:f:R:w:k:FWIvb:rcL:t", long_options,
&optindex)) != -1)
{
+ /*
+ * NOTE: some integer parameters (e.g. -p/--port) are stored internally
+ * as strings. We use repmgr_atoi() to check these but discard the
+ * returned integer; repmgr_atoi() will append the error message to the
+ * provided list.
+ */
switch (c)
{
case '?':
- help(progname);
+ help();
exit(SUCCESS);
case 'V':
- printf("%s %s (PostgreSQL %s)\n", progname, REPMGR_VERSION, PG_VERSION);
+ printf("%s %s (PostgreSQL %s)\n", progname(), REPMGR_VERSION, PG_VERSION);
exit(SUCCESS);
case 'd':
strncpy(runtime_options.dbname, optarg, MAXLEN);
@@ -191,8 +199,10 @@ main(int argc, char **argv)
strncpy(runtime_options.host, optarg, MAXLEN);
break;
case 'p':
- if (atoi(optarg) > 0)
- strncpy(runtime_options.masterport, optarg, MAXLEN);
+ repmgr_atoi(optarg, "-p/--port", &cli_errors);
+ strncpy(runtime_options.masterport,
+ optarg,
+ MAXLEN);
break;
case 'U':
strncpy(runtime_options.username, optarg, MAXLEN);
@@ -204,8 +214,11 @@ main(int argc, char **argv)
strncpy(runtime_options.dest_dir, optarg, MAXFILENAME);
break;
case 'l':
- if (atoi(optarg) > 0)
- strncpy(runtime_options.localport, optarg, MAXLEN);
+ /* -l/--local-port is deprecated */
+ repmgr_atoi(optarg, "-l/--local-port", &cli_errors);
+ strncpy(runtime_options.localport,
+ optarg,
+ MAXLEN);
break;
case 'f':
strncpy(runtime_options.config_file, optarg, MAXLEN);
@@ -214,17 +227,14 @@ main(int argc, char **argv)
strncpy(runtime_options.remote_user, optarg, MAXLEN);
break;
case 'w':
- if (atoi(optarg) > 0)
- {
- strncpy(runtime_options.wal_keep_segments, optarg, MAXLEN);
- wal_keep_segments_used = true;
- }
+ repmgr_atoi(optarg, "-w/--wal-keep-segments", &cli_errors);
+ strncpy(runtime_options.wal_keep_segments,
+ optarg,
+ MAXLEN);
+ wal_keep_segments_used = true;
break;
case 'k':
- if (atoi(optarg) > 0)
- runtime_options.keep_history = atoi(optarg);
- else
- runtime_options.keep_history = 0;
+ runtime_options.keep_history = repmgr_atoi(optarg, "-k/--keep-history", &cli_errors);
break;
case 'F':
runtime_options.force = true;
@@ -247,6 +257,25 @@ main(int argc, char **argv)
case 'c':
runtime_options.fast_checkpoint = true;
break;
+ case 'L':
+ {
+ int detected_log_level = detect_log_level(optarg);
+ if (detected_log_level != -1)
+ {
+ strncpy(runtime_options.loglevel, optarg, MAXLEN);
+ }
+ else
+ {
+ PQExpBufferData invalid_log_level;
+ initPQExpBuffer(&invalid_log_level);
+ appendPQExpBuffer(&invalid_log_level, _("Invalid log level \"%s\" provided"), optarg);
+ error_list_append(&cli_errors, invalid_log_level.data);
+ }
+ break;
+ }
+ case 't':
+ runtime_options.terse = true;
+ break;
case 1:
runtime_options.initdb_no_pwprompt = true;
break;
@@ -258,7 +287,7 @@ main(int argc, char **argv)
if (targ < 1)
{
- error_list_append(_("Invalid value provided for '-r/--recovery-min-apply-delay'"));
+ error_list_append(&cli_errors, _("Invalid value provided for '-r/--recovery-min-apply-delay'"));
break;
}
if (ptr && *ptr)
@@ -267,7 +296,7 @@ main(int argc, char **argv)
strcmp(ptr, "min") != 0 && strcmp(ptr, "h") != 0 &&
strcmp(ptr, "d") != 0)
{
- error_list_append(_("Value provided for '-r/--recovery-min-apply-delay' must be one of ms/s/min/h/d"));
+ error_list_append(&cli_errors, _("Value provided for '-r/--recovery-min-apply-delay' must be one of ms/s/min/h/d"));
break;
}
}
@@ -283,11 +312,12 @@ main(int argc, char **argv)
initPQExpBuffer(&unknown_option);
appendPQExpBuffer(&unknown_option, _("Unknown option '%s'"), argv[optind - 1]);
- error_list_append(unknown_option.data);
+ error_list_append(&cli_errors, unknown_option.data);
}
}
}
+
/* Exit here already if errors in command line options found */
if (cli_errors.head != NULL)
{
@@ -324,7 +354,7 @@ main(int argc, char **argv)
PQExpBufferData unknown_mode;
initPQExpBuffer(&unknown_mode);
appendPQExpBuffer(&unknown_mode, _("Unknown server mode '%s'"), server_mode);
- error_list_append(unknown_mode.data);
+ error_list_append(&cli_errors, unknown_mode.data);
}
}
@@ -367,14 +397,14 @@ main(int argc, char **argv)
if (action == NO_ACTION) {
if (server_cmd == NULL)
{
- error_list_append("No server command provided");
+ error_list_append(&cli_errors, "No server command provided");
}
else
{
PQExpBufferData unknown_action;
initPQExpBuffer(&unknown_action);
appendPQExpBuffer(&unknown_action, _("Unknown server command '%s'"), server_cmd);
- error_list_append(unknown_action.data);
+ error_list_append(&cli_errors, unknown_action.data);
}
}
@@ -385,7 +415,7 @@ main(int argc, char **argv)
{
if (runtime_options.host[0])
{
- error_list_append(_("Conflicting parameters: you can't use -h while providing a node separately."));
+ error_list_append(&cli_errors, _("Conflicting parameters: you can't use -h while providing a node separately."));
}
else
{
@@ -399,7 +429,7 @@ main(int argc, char **argv)
PQExpBufferData too_many_args;
initPQExpBuffer(&too_many_args);
appendPQExpBuffer(&too_many_args, _("too many command-line arguments (first extra is \"%s\")"), argv[optind]);
- error_list_append(too_many_args.data);
+ error_list_append(&cli_errors, too_many_args.data);
}
check_parameters_for_action(action);
@@ -413,6 +443,11 @@ main(int argc, char **argv)
exit_with_errors();
}
+ if (cli_warnings.head != NULL && runtime_options.terse == false)
+ {
+ print_error_list(&cli_warnings, LOG_WARNING);
+ }
+
if (!runtime_options.dbname[0])
{
if (getenv("PGDATABASE"))
@@ -424,7 +459,7 @@ main(int argc, char **argv)
}
/*
- * If no primary port (-p, --port) provided, explicitly set the
+ * If no primary port (-p/--port) provided, explicitly set the
* default PostgreSQL port.
*/
if (!runtime_options.masterport[0])
@@ -432,19 +467,15 @@ main(int argc, char **argv)
strncpy(runtime_options.masterport, DEFAULT_MASTER_PORT, MAXLEN);
}
-
- if (runtime_options.verbose && runtime_options.config_file[0])
- {
- log_notice(_("opening configuration file: %s\n"),
- runtime_options.config_file);
- }
-
/*
* The configuration file is not required for some actions (e.g. 'standby clone'),
* however if available we'll parse it anyway for options like 'log_level',
* 'use_replication_slots' etc.
*/
- config_file_parsed = load_config(runtime_options.config_file, &options, argv[0]);
+ config_file_parsed = load_config(runtime_options.config_file,
+ runtime_options.verbose,
+ &options,
+ argv[0]);
/*
* Initialise pg_bindir - command line parameter will override
@@ -474,7 +505,7 @@ main(int argc, char **argv)
keywords[3] = "dbname";
values[3] = runtime_options.dbname;
keywords[4] = "application_name";
- values[4] = (char *) progname;
+ values[4] = (char *) progname();
keywords[5] = NULL;
values[5] = NULL;
@@ -485,9 +516,21 @@ main(int argc, char **argv)
* logging level might be specified at, but it often requires detailed
* logging to troubleshoot problems.
*/
- logger_init(&options, progname, options.loglevel, options.logfacility);
+
+ /* Command-line parameter -L/--log-level overrides any setting in config file*/
+ if (*runtime_options.loglevel != '\0')
+ {
+ strncpy(options.loglevel, runtime_options.loglevel, MAXLEN);
+ }
+
+ logger_init(&options, progname());
+
if (runtime_options.verbose)
- logger_min_verbose(LOG_INFO);
+ logger_set_verbose();
+
+ if (runtime_options.terse)
+ logger_set_terse();
+
/*
* Node configuration information is not needed for all actions, with
@@ -540,6 +583,7 @@ main(int argc, char **argv)
{
maxlen_snprintf(repmgr_slot_name, "repmgr_slot_%i", options.node);
repmgr_slot_name_ptr = repmgr_slot_name;
+ log_verbose(LOG_DEBUG, "slot name initialised as: %s\n", repmgr_slot_name);
}
@@ -606,7 +650,7 @@ do_cluster_show(void)
{
log_err(_("Unable to retrieve node information from the database\n%s\n"),
PQerrorMessage(conn));
- log_notice(_("HINT: Please check that all nodes have been registered\n"));
+ log_hint(_("Please check that all nodes have been registered\n"));
PQclear(res);
PQfinish(conn);
@@ -643,6 +687,7 @@ do_cluster_cleanup(void)
PGconn *master_conn = NULL;
PGresult *res;
char sqlquery[QUERY_STR_LEN];
+ int entries_to_delete = 0;
/* We need to connect to check configuration */
log_info(_("connecting to database\n"));
@@ -660,6 +705,37 @@ do_cluster_cleanup(void)
}
PQfinish(conn);
+ log_debug(_("Number of days of monitoring history to retain: %i\n"), runtime_options.keep_history);
+
+ sqlquery_snprintf(sqlquery,
+ "SELECT COUNT(*) "
+ " FROM %s.repl_monitor "
+ " WHERE age(now(), last_monitor_time) >= '%d days'::interval ",
+ get_repmgr_schema_quoted(master_conn),
+ runtime_options.keep_history);
+
+ res = PQexec(master_conn, sqlquery);
+ if (PQresultStatus(res) != PGRES_TUPLES_OK)
+ {
+ log_err(_("cluster cleanup: unable to query number of monitoring records to clean up:\n%s\n"),
+ PQerrorMessage(master_conn));
+ PQclear(res);
+ PQfinish(master_conn);
+ exit(ERR_DB_QUERY);
+ }
+
+ entries_to_delete = atoi(PQgetvalue(res, 0, 0));
+ PQclear(res);
+
+ if (entries_to_delete == 0)
+ {
+ log_info(_("cluster cleanup: no monitoring records to delete\n"));
+ PQfinish(master_conn);
+ return;
+ }
+
+ log_debug(_("cluster cleanup: at least %i monitoring records to delete\n"), entries_to_delete);
+
if (runtime_options.keep_history > 0)
{
sqlquery_snprintf(sqlquery,
@@ -674,14 +750,15 @@ do_cluster_cleanup(void)
"TRUNCATE TABLE %s.repl_monitor",
get_repmgr_schema_quoted(master_conn));
}
+
res = PQexec(master_conn, sqlquery);
if (PQresultStatus(res) != PGRES_COMMAND_OK)
{
- log_err(_("cluster cleanup: Couldn't clean history\n%s\n"),
+ log_err(_("cluster cleanup: unable to delete monitoring records\n%s\n"),
PQerrorMessage(master_conn));
PQclear(res);
PQfinish(master_conn);
- exit(ERR_BAD_CONFIG);
+ exit(ERR_DB_QUERY);
}
PQclear(res);
@@ -696,6 +773,15 @@ do_cluster_cleanup(void)
PQclear(res);
PQfinish(master_conn);
+
+ if (runtime_options.keep_history > 0)
+ {
+ log_info(_("cluster cleanup: monitoring records older than %i day(s) deleted\n"), runtime_options.keep_history);
+ }
+ else
+ {
+ log_info(_("cluster cleanup: all monitoring records deleted\n"));
+ }
}
@@ -708,6 +794,8 @@ do_master_register(void)
bool schema_exists = false;
int ret;
+ int primary_node_id = UNKNOWN_NODE_ID;
+
bool record_created;
conn = establish_db_connection(options.conninfo, true);
@@ -717,7 +805,7 @@ do_master_register(void)
check_server_version(conn, "master", true, NULL);
/* Check we are a master */
- log_info(_("connected to master, checking its state\n"));
+ log_verbose(LOG_INFO, _("connected to master, checking its state\n"));
ret = is_standby(conn);
if (ret)
@@ -764,19 +852,29 @@ do_master_register(void)
PQfinish(master_conn);
- /* XXX we should check if a node with a different ID is registered as
- master, otherwise it would be possible to insert a duplicate record
- with --force, which would result in an unwelcome "multi-master" situation
+ begin_transaction(conn);
+
+ /*
+ * Check if a node with a different ID is registered as primary. This shouldn't
+ * happen but could do if an existing master was shut down without being
+ * unregistered.
*/
+ primary_node_id = get_master_node_id(conn, options.cluster_name);
+ if (primary_node_id != NODE_NOT_FOUND && primary_node_id != options.node)
+ {
+ log_err(_("another node with id %i is already registered as master\n"), primary_node_id);
+ rollback_transaction(conn);
+ PQfinish(conn);
+ exit(ERR_BAD_CONFIG);
+ }
+
/* Delete any existing record for this node if --force set */
if (runtime_options.force)
{
PGresult *res;
bool node_record_deleted;
- begin_transaction(conn);
-
res = get_node_record(conn, options.cluster_name, options.node);
if (PQntuples(res))
{
@@ -793,7 +891,6 @@ do_master_register(void)
}
}
- commit_transaction(conn);
}
@@ -811,10 +908,13 @@ do_master_register(void)
if (record_created == false)
{
+ rollback_transaction(conn);
PQfinish(conn);
exit(ERR_DB_QUERY);
}
+ commit_transaction(conn);
+
/* Log the event */
create_event_record(conn,
&options,
@@ -870,7 +970,7 @@ do_standby_register(void)
NULL, NULL);
if (!master_conn)
{
- log_err(_("a master must be defined before configuring a slave\n"));
+ log_err(_("a master must be defined before configuring a standby\n"));
exit(ERR_BAD_CONFIG);
}
@@ -913,9 +1013,10 @@ do_standby_register(void)
{
if (!runtime_options.force)
{
- log_notice(_("HINT: use option -F/--force to overwrite an existing node record\n"));
+ log_hint(_("use option -F/--force to overwrite an existing node record\n"));
}
+ // XXX log registration failure?
PQfinish(master_conn);
PQfinish(conn);
exit(ERR_BAD_CONFIG);
@@ -977,7 +1078,7 @@ do_standby_unregister(void)
NULL, NULL);
if (!master_conn)
{
- log_err(_("a master must be defined before unregistering a slave\n"));
+ log_err(_("a master must be defined before unregistering a standby\n"));
exit(ERR_BAD_CONFIG);
}
@@ -1084,7 +1185,7 @@ do_standby_clone(void)
upstream_conn = establish_db_connection_by_params(keywords, values, true);
/* Verify that upstream node is a supported server version */
- log_info(_("connected to upstream node, checking its state\n"));
+ log_verbose(LOG_INFO, _("connected to upstream node, checking its state\n"));
server_version_num = check_server_version(upstream_conn, "master", true, NULL);
check_upstream_config(upstream_conn, server_version_num, true);
@@ -1261,9 +1362,10 @@ do_standby_clone(void)
strncpy(local_config_file, master_config_file, MAXFILENAME);
strncpy(local_hba_file, master_hba_file, MAXFILENAME);
strncpy(local_ident_file, master_ident_file, MAXFILENAME);
- }
- log_notice(_("starting backup...\n"));
+ log_notice(_("setting data directory to: %s\n"), local_data_directory);
+ log_hint(_("use -D/--data-dir to explicitly specify a data directory\n"));
+ }
/*
* When using rsync only, we need to check the SSH connection early
@@ -1286,12 +1388,32 @@ do_standby_clone(void)
{
log_err(_("unable to use directory %s ...\n"),
local_data_directory);
- log_notice(_("HINT: Use -F/--force option to force this directory to be overwritten\n"));
+ log_hint(_("use -F/--force option to force this directory to be overwritten\n"));
r = ERR_BAD_CONFIG;
retval = ERR_BAD_CONFIG;
goto stop_backup;
}
+ /*
+ * If replication slots requested, create appropriate slot on
+ * the primary; this must be done before pg_start_backup() is
+ * issued, either by us or by pg_basebackup.
+ */
+ if (options.use_replication_slots)
+ {
+ if (create_replication_slot(upstream_conn, repmgr_slot_name) == false)
+ {
+ PQfinish(upstream_conn);
+ exit(ERR_DB_QUERY);
+ }
+ }
+
+ log_notice(_("starting backup...\n"));
+ if (runtime_options.fast_checkpoint == false)
+ {
+ log_hint(_("this may take some time; consider using the -c/--fast-checkpoint option\n"));
+ }
+
if (runtime_options.rsync_only)
{
PQExpBufferData tablespace_map;
@@ -1304,13 +1426,14 @@ do_standby_clone(void)
}
/*
- * From pg 9.1 default is to wait for a sync standby to ack, avoid that by
+ * From 9.1 default is to wait for a sync standby to ack, avoid that by
* turning off sync rep for this session
*/
if (set_config_bool(upstream_conn, "synchronous_commit", false) == false)
{
- PQfinish(upstream_conn);
- exit(ERR_BAD_CONFIG);
+ r = ERR_BAD_CONFIG;
+ retval = ERR_BAD_CONFIG;
+ goto stop_backup;
}
if (start_backup(upstream_conn, first_wal_segment, runtime_options.fast_checkpoint) == false)
@@ -1356,13 +1479,16 @@ do_standby_clone(void)
" WHERE spcname NOT IN ('pg_default', 'pg_global')");
res = PQexec(upstream_conn, sqlquery);
+
if (PQresultStatus(res) != PGRES_TUPLES_OK)
{
log_err(_("unable to execute tablespace query: %s\n"),
PQerrorMessage(upstream_conn));
+
PQclear(res);
- PQfinish(upstream_conn);
- exit(ERR_BAD_CONFIG);
+
+ r = retval = ERR_DB_QUERY;
+ goto stop_backup;
}
for (i = 0; i < PQntuples(res); i++)
@@ -1438,22 +1564,29 @@ do_standby_clone(void)
if (unlink(tblspc_symlink.data) < 0 && errno != ENOENT)
{
log_err(_("unable to remove tablespace symlink %s\n"), tblspc_symlink.data);
- exit(ERR_BAD_CONFIG);
+
+ PQclear(res);
+
+ r = retval = ERR_BAD_BASEBACKUP;
+ goto stop_backup;
}
+
if (symlink(tblspc_dir_dst.data, tblspc_symlink.data) < 0)
{
log_err(_("unable to create tablespace symlink from %s to %s\n"), tblspc_symlink.data, tblspc_dir_dst.data);
- exit(ERR_BAD_CONFIG);
- }
+ PQclear(res);
+ r = retval = ERR_BAD_BASEBACKUP;
+ goto stop_backup;
+ }
}
-
}
}
+ PQclear(res);
- if(server_version_num >= 90500 && tablespace_map_rewrite == true)
+ if (server_version_num >= 90500 && tablespace_map_rewrite == true)
{
PQExpBufferData tablespace_map_filename;
FILE *tablespace_map_file;
@@ -1466,27 +1599,30 @@ do_standby_clone(void)
if (unlink(tablespace_map_filename.data) < 0 && errno != ENOENT)
{
log_err(_("unable to remove tablespace_map file %s\n"), tablespace_map_filename.data);
- exit(ERR_BAD_CONFIG);
+
+ r = retval = ERR_BAD_BASEBACKUP;
+ goto stop_backup;
}
tablespace_map_file = fopen(tablespace_map_filename.data, "w");
if (tablespace_map_file == NULL)
{
log_err(_("unable to create tablespace_map file '%s'\n"), tablespace_map_filename.data);
- exit(ERR_BAD_CONFIG);
+
+ r = retval = ERR_BAD_BASEBACKUP;
+ goto stop_backup;
}
if (fputs(tablespace_map.data, tablespace_map_file) == EOF)
{
log_err(_("unable to write to tablespace_map file '%s'\n"), tablespace_map_filename.data);
- fclose(tablespace_map_file);
- exit(ERR_BAD_CONFIG);
+
+ r = retval = ERR_BAD_BASEBACKUP;
+ goto stop_backup;
}
fclose(tablespace_map_file);
}
-
- PQclear(res);
}
else
{
@@ -1494,6 +1630,7 @@ do_standby_clone(void)
if (r != 0)
{
log_warning(_("standby clone: base backup failed\n"));
+
retval = ERR_BAD_BASEBACKUP;
goto stop_backup;
}
@@ -1611,20 +1748,34 @@ stop_backup:
/* If the backup failed then exit */
if (r != 0)
{
+ /* If a replication slot was previously created, drop it */
+ if (options.use_replication_slots)
+ {
+ drop_replication_slot(upstream_conn, repmgr_slot_name);
+ }
+
log_err(_("unable to take a base backup of the master server\n"));
log_warning(_("destination directory (%s) may need to be cleaned up manually\n"),
local_data_directory);
+
PQfinish(upstream_conn);
exit(retval);
}
+
/*
- * Remove existing WAL from the target directory, since
- * rsync's --exclude option doesn't do it.
+ * Clean up any $PGDATA subdirectories which may contain
+ * files which won't be removed by rsync and which could
+ * be stale or are otherwise not required
*/
- if (runtime_options.force)
+ if (runtime_options.rsync_only && runtime_options.force)
{
char script[MAXLEN];
+
+ /*
+ * Remove any existing WAL from the target directory, since
+ * rsync's --exclude option doesn't do it.
+ */
maxlen_snprintf(script, "rm -rf %s/pg_xlog/*",
local_data_directory);
r = system(script);
@@ -1634,25 +1785,34 @@ stop_backup:
local_data_directory);
exit(ERR_BAD_RSYNC);
}
- }
- /* Finally, write the recovery.conf file */
- create_recovery_file(local_data_directory);
+ /*
+ * Remove any replication slot directories; this matches the
+ * behaviour a base backup, which would result in an empty
+ * pg_replslot directory.
+ *
+ * NOTE: watch out for any changes in the replication
+ * slot directory name (as of 9.4: "pg_replslot") and
+ * functionality of replication slots
+ */
- /*
- * If replication slots requested, create appropriate slot on the primary;
- * create_recovery_file() will already have written `primary_slot_name` into
- * `recovery.conf`
- */
- if (options.use_replication_slots)
- {
- if (create_replication_slot(upstream_conn, repmgr_slot_name) == false)
+ if (server_version_num >= 90400)
{
- PQfinish(upstream_conn);
- exit(ERR_DB_QUERY);
+ maxlen_snprintf(script, "rm -rf %s/pg_replslot/*",
+ local_data_directory);
+ r = system(script);
+ if (r != 0)
+ {
+ log_err(_("unable to empty replication slot directory %s/pg_replslot/\n"),
+ local_data_directory);
+ exit(ERR_BAD_RSYNC);
+ }
}
}
+ /* Finally, write the recovery.conf file */
+ create_recovery_file(local_data_directory);
+
if (runtime_options.rsync_only)
{
log_notice(_("standby clone (using rsync) complete\n"));
@@ -1668,15 +1828,15 @@ stop_backup:
* - provide a custom pg_ctl command
*/
- log_notice(_("HINT: you can now start your PostgreSQL server\n"));
+ log_notice(_("you can now start your PostgreSQL server\n"));
if (target_directory_provided)
{
- log_notice(_("for example : pg_ctl -D %s start\n"),
+ log_hint(_("for example : pg_ctl -D %s start\n"),
local_data_directory);
}
else
{
- log_notice(_("for example : /etc/init.d/postgresql start\n"));
+ log_hint(_("for example : /etc/init.d/postgresql start\n"));
}
/* Log the event */
@@ -1733,7 +1893,7 @@ do_standby_promote(void)
conn = establish_db_connection(options.conninfo, true);
/* Verify that standby is a supported server version */
- log_info(_("connected to standby, checking its state\n"));
+ log_verbose(LOG_INFO, _("connected to standby, checking its state\n"));
check_server_version(conn, "standby", true, NULL);
@@ -1828,7 +1988,7 @@ do_standby_promote(void)
create_event_record(NULL,
&options,
options.node,
- "repmgrd_failover_promote",
+ "standby_promote",
false,
details.data);
@@ -1841,7 +2001,7 @@ do_standby_promote(void)
"Node %i was successfully promoted to master",
options.node);
- log_notice(_("STANDBY PROMOTE successful. You should REINDEX any hash indexes you have.\n"));
+ log_notice(_("STANDBY PROMOTE successful\n"));
/* Log the event */
create_event_record(conn,
@@ -1877,7 +2037,7 @@ do_standby_follow(void)
/* We need to connect to check configuration */
log_info(_("connecting to standby database\n"));
conn = establish_db_connection(options.conninfo, true);
- log_info(_("connected to standby, checking its state\n"));
+ log_verbose(LOG_INFO, _("connected to standby, checking its state\n"));
/* Check we are in a standby node */
retval = is_standby(conn);
@@ -1942,6 +2102,39 @@ do_standby_follow(void)
strncpy(runtime_options.masterport, PQport(master_conn), MAXLEN);
strncpy(runtime_options.username, PQuser(master_conn), MAXLEN);
+ /*
+ * If 9.4 or later, and replication slots in use, we'll need to create a
+ * slot on the new master
+ */
+
+ if (options.use_replication_slots)
+ {
+ if (create_replication_slot(master_conn, repmgr_slot_name) == false)
+ {
+ PQExpBufferData event_details;
+ initPQExpBuffer(&event_details);
+
+ appendPQExpBuffer(&event_details,
+ _("Unable to create slot '%s' on the master node: %s"),
+ repmgr_slot_name,
+ PQerrorMessage(master_conn));
+
+ log_err("%s\n", event_details.data);
+
+ create_event_record(master_conn,
+ &options,
+ options.node,
+ "repmgr_follow",
+ false,
+ event_details.data);
+
+ PQfinish(conn);
+ PQfinish(master_conn);
+ exit(ERR_DB_QUERY);
+ }
+ }
+
+
log_info(_("changing standby's master\n"));
/* Get the data directory full path */
@@ -2146,7 +2339,7 @@ do_witness_create(void)
exit(ERR_BAD_CONFIG);
}
- xsnprintf(buf, sizeof(buf), "\n#Configuration added by %s\n", progname);
+ xsnprintf(buf, sizeof(buf), "\n#Configuration added by %s\n", progname());
fputs(buf, pg_conf);
@@ -2437,49 +2630,62 @@ do_witness_create(void)
static void
-help(const char *progname)
+help(void)
{
- printf(_("%s: replication management tool for PostgreSQL\n"), progname);
+ printf(_("%s: replication management tool for PostgreSQL\n"), progname());
printf(_("\n"));
printf(_("Usage:\n"));
- printf(_(" %s [OPTIONS] master {register}\n"), progname);
+ printf(_(" %s [OPTIONS] master register\n"), progname());
printf(_(" %s [OPTIONS] standby {register|unregister|clone|promote|follow}\n"),
- progname);
- printf(_(" %s [OPTIONS] cluster {show|cleanup}\n"), progname);
+ progname());
+ printf(_(" %s [OPTIONS] cluster {show|cleanup}\n"), progname());
printf(_("\n"));
printf(_("General options:\n"));
printf(_(" -?, --help show this help, then exit\n"));
printf(_(" -V, --version output version information, then exit\n"));
- printf(_(" -v, --verbose output verbose activity information\n"));
- printf(_("\nConnection options:\n"));
+ printf(_("\n"));
+ printf(_("Logging options:\n"));
+ printf(_(" -L, --log-level set log level (overrides configuration file)\n"));
+ printf(_(" -v, --verbose display additional log output (useful for debugging)\n"));
+ printf(_(" -t, --terse don't display hints and other non-critical output\n"));
+ printf(_("\n"));
+ printf(_("Connection options:\n"));
printf(_(" -d, --dbname=DBNAME database to connect to\n"));
printf(_(" -h, --host=HOSTNAME database server host or socket directory\n"));
printf(_(" -p, --port=PORT database server port\n"));
printf(_(" -U, --username=USERNAME database user name to connect as\n"));
- printf(_("\nConfiguration options:\n"));
+ printf(_("\n"));
+ printf(_("General configuration options:\n"));
printf(_(" -b, --pg_bindir=PATH path to PostgreSQL binaries (optional)\n"));
printf(_(" -D, --data-dir=DIR local directory where the files will be\n" \
" copied to\n"));
printf(_(" -f, --config-file=PATH path to the configuration file\n"));
printf(_(" -R, --remote-user=USERNAME database server username for rsync\n"));
- printf(_(" -S, --superuser=USERNAME superuser username for witness database\n" \
- " (default: postgres)\n"));
-/* remove this line in the next significant release */
- printf(_(" -l, --local-port=PORT (DEPRECATED) witness server local port (default: %s)\n"), WITNESS_DEFAULT_PORT);
- printf(_(" -w, --wal-keep-segments=VALUE minimum value for the GUC\n" \
- " wal_keep_segments (default: %s)\n"), DEFAULT_WAL_KEEP_SEGMENTS);
- printf(_(" -k, --keep-history=VALUE keeps indicated number of days of history\n"));
printf(_(" -F, --force force potentially dangerous operations to happen\n"));
- printf(_(" -W, --wait wait for a master to appear\n"));
- printf(_(" -r, --rsync-only use only rsync to clone a standby\n"));
- printf(_(" -c, --fast-checkpoint force fast checkpoint when cloning a standby\n"));
- printf(_(" --recovery-min-apply-delay=VALUE set recovery_min_apply_delay in recovery.conf\n" \
- " when cloning a standby (PostgreSQL 9.4 and later)\n"));
- printf(_(" --ignore-external-config-files don't copy configuration files located outside \n" \
- " the data directory when cloning a standby\n"));
- printf(_(" --initdb-no-pwprompt don't require superuser password when running initdb\n"));
printf(_(" --check-upstream-config verify upstream server configuration\n"));
- printf(_("\n%s performs the following node management tasks:\n\n"), progname);
+ printf(_("\n"));
+ printf(_("Command-specific configuration options:\n"));
+ printf(_(" -c, --fast-checkpoint (standby clone) force fast checkpoint\n"));
+ printf(_(" -r, --rsync-only (standby clone) use only rsync, not pg_basebackup\n"));
+ printf(_(" --recovery-min-apply-delay=VALUE (standby clone, follow) set recovery_min_apply_delay\n" \
+ " in recovery.conf (PostgreSQL 9.4 and later)\n"));
+ printf(_(" --ignore-external-config-files (standby clone) don't copy configuration files located\n" \
+ " outside the data directory when cloning a standby\n"));
+ printf(_(" -w, --wal-keep-segments=VALUE (standby clone) minimum value for the GUC\n" \
+ " wal_keep_segments (default: %s)\n"), DEFAULT_WAL_KEEP_SEGMENTS);
+ printf(_(" -W, --wait (standby follow) wait for a master to appear\n"));
+ printf(_(" -k, --keep-history=VALUE (cluster cleanup) retain indicated number of days of history\n"));
+
+
+ printf(_(" --initdb-no-pwprompt (witness server) no superuser password prompt during initdb\n"));
+/* remove this line in the next significant release */
+ printf(_(" -l, --local-port=PORT (witness server) witness server local port, default: %s \n" \
+ " (DEPRECATED, put port in conninfo)\n"), WITNESS_DEFAULT_PORT);
+ printf(_(" -S, --superuser=USERNAME (witness server) superuser username for witness database\n" \
+ " (default: postgres)\n"));
+ printf(_("\n"));
+ printf(_("%s performs the following node management tasks:\n"), progname());
+ printf(_("\n"));
printf(_("COMMANDS:\n"));
printf(_(" master register - registers the master in a cluster\n"));
printf(_(" standby clone [node] - creates a new standby\n"));
@@ -2795,11 +3001,11 @@ check_parameters_for_action(const int action)
if (runtime_options.host[0] || runtime_options.masterport[0] ||
runtime_options.username[0] || runtime_options.dbname[0])
{
- error_list_append(_("master connection parameters not required when executing MASTER REGISTER"));
+ error_list_append(&cli_warnings, _("master connection parameters not required when executing MASTER REGISTER"));
}
if (runtime_options.dest_dir[0])
{
- error_list_append(_("destination directory not required when executing MASTER REGISTER"));
+ error_list_append(&cli_warnings, _("destination directory not required when executing MASTER REGISTER"));
}
break;
case STANDBY_REGISTER:
@@ -2812,11 +3018,11 @@ check_parameters_for_action(const int action)
if (runtime_options.host[0] || runtime_options.masterport[0] ||
runtime_options.username[0] || runtime_options.dbname[0])
{
- error_list_append(_("master connection parameters not required when executing STANDBY REGISTER"));
+ error_list_append(&cli_warnings, _("master connection parameters not required when executing STANDBY REGISTER"));
}
if (runtime_options.dest_dir[0])
{
- error_list_append(_("destination directory not required when executing STANDBY REGISTER"));
+ error_list_append(&cli_warnings, _("destination directory not required when executing STANDBY REGISTER"));
}
break;
case STANDBY_UNREGISTER:
@@ -2829,11 +3035,11 @@ check_parameters_for_action(const int action)
if (runtime_options.host[0] || runtime_options.masterport[0] ||
runtime_options.username[0] || runtime_options.dbname[0])
{
- error_list_append(_("master connection parameters not required when executing STANDBY UNREGISTER"));
+ error_list_append(&cli_warnings, _("master connection parameters not required when executing STANDBY UNREGISTER"));
}
if (runtime_options.dest_dir[0])
{
- error_list_append(_("destination directory not required when executing STANDBY UNREGISTER"));
+ error_list_append(&cli_warnings, _("destination directory not required when executing STANDBY UNREGISTER"));
}
break;
case STANDBY_PROMOTE:
@@ -2847,11 +3053,11 @@ check_parameters_for_action(const int action)
if (runtime_options.host[0] || runtime_options.masterport[0] ||
runtime_options.username[0] || runtime_options.dbname[0])
{
- error_list_append(_("master connection parameters not required when executing STANDBY PROMOTE"));
+ error_list_append(&cli_warnings, _("master connection parameters not required when executing STANDBY PROMOTE"));
}
if (runtime_options.dest_dir[0])
{
- error_list_append(_("destination directory not required when executing STANDBY PROMOTE"));
+ error_list_append(&cli_warnings, _("destination directory not required when executing STANDBY PROMOTE"));
}
break;
case STANDBY_FOLLOW:
@@ -2865,11 +3071,11 @@ check_parameters_for_action(const int action)
if (runtime_options.host[0] || runtime_options.masterport[0] ||
runtime_options.username[0] || runtime_options.dbname[0])
{
- error_list_append(_("master connection parameters not required when executing STANDBY FOLLOW"));
+ error_list_append(&cli_warnings, _("master connection parameters not required when executing STANDBY FOLLOW"));
}
if (runtime_options.dest_dir[0])
{
- error_list_append(_("destination directory not required when executing STANDBY FOLLOW"));
+ error_list_append(&cli_warnings, _("destination directory not required when executing STANDBY FOLLOW"));
}
break;
case STANDBY_CLONE:
@@ -2882,17 +3088,17 @@ check_parameters_for_action(const int action)
if (strcmp(runtime_options.host, "") == 0)
{
- error_list_append(_("master hostname (-h/--host) required when executing STANDBY CLONE"));
+ error_list_append(&cli_errors, _("master hostname (-h/--host) required when executing STANDBY CLONE"));
}
if (strcmp(runtime_options.dbname, "") == 0)
{
- error_list_append(_("master database name (-d/--dbname) required when executing STANDBY CLONE"));
+ error_list_append(&cli_errors, _("master database name (-d/--dbname) required when executing STANDBY CLONE"));
}
if (strcmp(runtime_options.username, "") == 0)
{
- error_list_append(_("master database username (-U/--username) required when executing STANDBY CLONE"));
+ error_list_append(&cli_errors, _("master database username (-U/--username) required when executing STANDBY CLONE"));
}
config_file_required = false;
@@ -2908,26 +3114,32 @@ check_parameters_for_action(const int action)
break;
}
+ /* Warn about parameters which apply to STANDBY CLONE only */
if (action != STANDBY_CLONE)
{
- if (runtime_options.rsync_only)
- {
- error_list_append(_("--rsync-only can only be used when executing STANDBY CLONE"));
- }
-
if (runtime_options.fast_checkpoint)
{
- error_list_append(_("--fast-checkpoint can only be used when executing STANDBY CLONE"));
+ error_list_append(&cli_warnings, _("-c/--fast-checkpoint can only be used when executing STANDBY CLONE"));
}
if (runtime_options.ignore_external_config_files)
{
- error_list_append(_("--ignore-external-config-files can only be used when executing STANDBY CLONE"));
+ error_list_append(&cli_warnings, _("--ignore-external-config-files can only be used when executing STANDBY CLONE"));
}
if (*runtime_options.recovery_min_apply_delay)
{
- error_list_append(_("--recovery-min-apply-delay can only be used when executing STANDBY CLONE"));
+ error_list_append(&cli_warnings, _("--recovery-min-apply-delay can only be used when executing STANDBY CLONE"));
+ }
+
+ if (runtime_options.rsync_only)
+ {
+ error_list_append(&cli_warnings, _("-r/--rsync-only can only be used when executing STANDBY CLONE"));
+ }
+
+ if (wal_keep_segments_used)
+ {
+ error_list_append(&cli_warnings, _("-w/--wal-keep-segments can only be used when executing STANDBY CLONE"));
}
}
@@ -3214,11 +3426,6 @@ write_primary_conninfo(char *line)
{
maxlen_snprintf(password_buf, " password=%s", password);
}
- else if (require_password)
- {
- log_err(_("password required but none provided and PGPASSWORD not set\n"));
- exit(ERR_BAD_PASSWORD);
- }
if (runtime_options.host[0])
{
@@ -3273,7 +3480,7 @@ check_server_version(PGconn *conn, char *server_type, bool exit_on_error, char *
{
if (server_version_num > 0)
log_err(_("%s requires %s to be PostgreSQL %s or later\n"),
- progname,
+ progname(),
server_type,
MIN_SUPPORTED_VERSION
);
@@ -3414,7 +3621,7 @@ check_upstream_config(PGconn *conn, int server_version_num, bool exit_on_error)
if (i == 0)
{
log_err(_("parameter 'max_replication_slots' must be set to at least 1 to enable replication slots\n"));
- log_notice(_("HINT: 'max_replication_slots' should be set to at least the number of expected standbys\n"));
+ log_hint(_("'max_replication_slots' should be set to at least the number of expected standbys\n"));
if (exit_on_error == true)
{
PQfinish(conn);
@@ -3443,7 +3650,7 @@ check_upstream_config(PGconn *conn, int server_version_num, bool exit_on_error)
runtime_options.wal_keep_segments);
if (server_version_num >= 90400)
{
- log_notice(_("HINT: in PostgreSQL 9.4 and later, replication slots can be used, which "
+ log_hint(_("in PostgreSQL 9.4 and later, replication slots can be used, which "
"do not require 'wal_keep_segments' to be set to a high value "
"(set parameter 'use_replication_slots' in the configuration file to enable)\n"
));
@@ -3484,7 +3691,6 @@ check_upstream_config(PGconn *conn, int server_version_num, bool exit_on_error)
* of what's in 'archive_command', so until 'archive_mode' is on we can't
* properly check it.
*/
-
if (guc_set(conn, "archive_mode", "=", "on"))
{
i = guc_set(conn, "archive_command", "!=", "");
@@ -3505,6 +3711,11 @@ check_upstream_config(PGconn *conn, int server_version_num, bool exit_on_error)
}
+ /*
+ * Check that 'hot_standby' is on. This isn't strictly necessary
+ * for the primary server; however, the assumption is that configuration
+ * should be consistent for all servers in a cluster.
+ */
i = guc_set(conn, "hot_standby", "=", "on");
if (i == 0 || i == -1)
{
@@ -3526,7 +3737,7 @@ check_upstream_config(PGconn *conn, int server_version_num, bool exit_on_error)
if (i == 0)
{
log_err(_("parameter 'max_wal_senders' must be set to be at least 1\n"));
- log_notice(_("HINT: 'max_wal_senders' should be set to at least the number of expected standbys\n"));
+ log_hint(_("'max_wal_senders' should be set to at least the number of expected standbys\n"));
}
if (exit_on_error == true)
@@ -3548,7 +3759,7 @@ update_node_record_set_master(PGconn *conn, int this_node_id)
PGresult *res;
char sqlquery[QUERY_STR_LEN];
- log_debug(_("Setting %i as master and marking existing master as failed\n"), this_node_id);
+ log_debug(_("setting node %i as master and marking existing master as failed\n"), this_node_id);
begin_transaction(conn);
@@ -3626,7 +3837,7 @@ do_check_upstream_config(void)
conn = establish_db_connection_by_params(keywords, values, true);
/* Verify that upstream server is a supported server version */
- log_info(_("connected to upstream server, checking its state\n"));
+ log_verbose(LOG_INFO, _("connected to upstream server, checking its state\n"));
server_version_num = check_server_version(conn, "upstream server", false, NULL);
config_ok = check_upstream_config(conn, server_version_num, false);
@@ -3650,46 +3861,35 @@ make_pg_path(char *file)
static void
-error_list_append(char *error_message)
+exit_with_errors(void)
{
- ErrorListCell *cell;
-
- cell = (ErrorListCell *) pg_malloc0(sizeof(ErrorListCell));
+ fprintf(stderr, _("%s: following command line errors were encountered.\n"), progname());
- if (cell == NULL)
- {
- log_err(_("unable to allocate memory; terminating.\n"));
- exit(ERR_BAD_CONFIG);
- }
+ print_error_list(&cli_errors, LOG_ERR);
- cell->error_message = error_message;
+ fprintf(stderr, _("Try \"%s --help\" for more information.\n"), progname());
- if (cli_errors.tail)
- {
- cli_errors.tail->next = cell;
- }
- else
- {
- cli_errors.head = cell;
- }
-
- cli_errors.tail = cell;
+ exit(ERR_BAD_CONFIG);
}
static void
-exit_with_errors(void)
+print_error_list(ErrorList *error_list, int log_level)
{
ErrorListCell *cell;
- fprintf(stderr, _("%s: Replication manager \n"), progname);
-
- for (cell = cli_errors.head; cell; cell = cell->next)
+ for (cell = error_list->head; cell; cell = cell->next)
{
- fprintf(stderr, "[ERROR] %s\n", cell->error_message);
- }
-
- fprintf(stderr, _("Try \"%s --help\" for more information.\n"), progname);
+ switch(log_level)
+ {
+ /* Currently we only need errors and warnings */
+ case LOG_ERR:
+ log_err("%s\n", cell->error_message);
+ break;
+ case LOG_WARNING:
+ log_warning("%s\n", cell->error_message);
+ break;
+ }
- exit(ERR_BAD_CONFIG);
+ }
}
diff --git a/repmgr.conf.sample b/repmgr.conf.sample
index ca61fbe..bd0e5e8 100644
--- a/repmgr.conf.sample
+++ b/repmgr.conf.sample
@@ -16,11 +16,15 @@ cluster=example_cluster
# Node ID and name
# (Note: we recommend to avoid naming nodes after their initial
# replication funcion, as this will cause confusion when e.g.
-# "standby2" is promoted to master)
-node=2
-node_name=node2
-
-# Database connection information
+# "standby2" is promoted to primary)
+node=2 # a unique integer
+node_name=node2 # an arbitrary (but unique) string; we recommend using
+ # the server's hostname or another identifier unambiguously
+ # associated with the server to avoid confusion
+
+# Database connection information as a conninfo string
+# This must be accessible to all servers in the cluster; for details see:
+# http://www.postgresql.org/docs/current/static/libpq-connect.html#LIBPQ-CONNSTRING
conninfo='host=192.168.204.104 dbname=repmgr_db user=repmgr_usr'
# Optional configuration items
@@ -32,7 +36,7 @@ conninfo='host=192.168.204.104 dbname=repmgr_db user=repmgr_usr'
# when using cascading replication and a standby is to be connected to an
# upstream standby, specify that node's ID with 'upstream_node'. The node
# must exist before the new standby can be registered. If a standby is
-# to connect directly to a master node, this parameter is not required.
+# to connect directly to a primary node, this parameter is not required.
#
# upstream_node=1
@@ -40,7 +44,9 @@ conninfo='host=192.168.204.104 dbname=repmgr_db user=repmgr_usr'
# (default: 0)
#
# use_replication_slots=0
-
+#
+# NOTE: 'max_replication_slots' should be configured for at least the
+# number of standbys which will connect to the primary.
# Logging and monitoring settings
# -------------------------------
@@ -110,28 +116,29 @@ logfacility=STDERR
#
# These settings are only applied when repmgrd is running.
-# How many seconds we wait for master response before declaring master failure
+# Number of seconds to wait for a response from the primary server before
+# deciding it has failed
+
master_response_timeout=60
-# How many time we try to reconnect to master before starting failover procedure
+# Number of times to try to reconnect to the primary before starting
+# the failover procedure
reconnect_attempts=6
reconnect_interval=10
# Autofailover options
failover=automatic # one of 'automatic', 'manual'
-priority=100 # a value of zero or less prevents the node being promoted to master
+priority=100 # a value of zero or less prevents the node being promoted to primary
promote_command='repmgr standby promote -f /path/to/repmgr.conf'
follow_command='repmgr standby follow -f /path/to/repmgr.conf -W'
-# monitoring interval; default is 2s
+# monitoring interval in seconds; default is 2
#
# monitor_interval_secs=2
-# change wait time for master; before we bail out and exit when the master
+# change wait time for primary; before we bail out and exit when the primary
# disappears, we wait 'reconnect_attempts' * 'retry_promote_interval_secs'
# seconds; by default this would be half an hour, as 'retry_promote_interval_secs'
# default value is 300)
#
# retry_promote_interval_secs=300
-
-
diff --git a/repmgr.h b/repmgr.h
index 5fa9e97..c5d2ecf 100644
--- a/repmgr.h
+++ b/repmgr.h
@@ -36,7 +36,6 @@
#define MAXFILENAME 1024
#define ERRBUFF_SIZE 512
-#define DEFAULT_CONFIG_FILE "./repmgr.conf"
#define DEFAULT_WAL_KEEP_SEGMENTS "5000"
#define DEFAULT_DEST_DIR "."
#define DEFAULT_MASTER_PORT "5432"
@@ -49,14 +48,7 @@
#define AUTOMATIC_FAILOVER 1
#define NODE_NOT_FOUND -1
#define NO_UPSTREAM_NODE -1
-
-
-typedef enum {
- UNKNOWN = 0,
- MASTER,
- STANDBY,
- WITNESS
-} t_server_type;
+#define UNKNOWN_NODE_ID -1
@@ -73,6 +65,7 @@ typedef struct
char superuser[MAXLEN];
char wal_keep_segments[MAXLEN];
bool verbose;
+ bool terse;
bool force;
bool wait_for_master;
bool ignore_rsync_warn;
@@ -82,6 +75,7 @@ typedef struct
bool ignore_external_config_files;
char masterport[MAXLEN];
char localport[MAXLEN];
+ char loglevel[MAXLEN];
/* parameter used by CLUSTER CLEANUP */
int keep_history;
@@ -91,20 +85,9 @@ typedef struct
char recovery_min_apply_delay[MAXLEN];
} t_runtime_options;
-#define T_RUNTIME_OPTIONS_INITIALIZER { "", "", "", "", "", "", "", DEFAULT_WAL_KEEP_SEGMENTS, false, false, false, false, false, false, false, false, "", "", 0, "", "" }
+#define T_RUNTIME_OPTIONS_INITIALIZER { "", "", "", "", "", "", "", DEFAULT_WAL_KEEP_SEGMENTS, false, false, false, false, false, false, false, false, false, "", "", "", 0, "", "" }
extern char repmgr_schema[MAXLEN];
-typedef struct ErrorListCell
-{
- struct ErrorListCell *next;
- char *error_message;
-} ErrorListCell;
-
-typedef struct ErrorList
-{
- ErrorListCell *head;
- ErrorListCell *tail;
-} ErrorList;
#endif
diff --git a/repmgrd.c b/repmgrd.c
index 38f5410..21271a1 100644
--- a/repmgrd.c
+++ b/repmgrd.c
@@ -41,22 +41,6 @@
#include "access/xlogdefs.h"
#include "pqexpbuffer.h"
-/*
- * Struct to store node information
- */
-typedef struct s_node_info
-{
- int node_id;
- int upstream_node_id;
- char conninfo_str[MAXLEN];
- XLogRecPtr xlog_location;
- t_server_type type;
- bool is_ready;
- bool is_visible;
- char slot_name[MAXLEN];
- bool active;
-} t_node_info;
-
/* Local info */
@@ -68,9 +52,7 @@ t_configuration_options master_options;
PGconn *master_conn = NULL;
-const char *progname;
-
-char *config_file = DEFAULT_CONFIG_FILE;
+char *config_file = "";
bool verbose = false;
bool monitoring_history = false;
t_node_info node_info;
@@ -81,7 +63,7 @@ char *pid_file = NULL;
t_configuration_options config = T_CONFIGURATION_OPTIONS_INITIALIZER;
-static void help(const char *progname);
+static void help(void);
static void usage(void);
static void check_cluster_configuration(PGconn *conn);
static void check_node_configuration(void);
@@ -89,7 +71,7 @@ static void check_node_configuration(void);
static void standby_monitor(void);
static void witness_monitor(void);
static bool check_connection(PGconn **conn, const char *type, const char *conninfo);
-static bool set_local_node_failed(void);
+static bool set_local_node_status(void);
static void update_shared_memory(char *last_wal_standby_applied);
static void update_registration(void);
@@ -158,9 +140,10 @@ main(int argc, char **argv)
FILE *fd;
int server_version_num = 0;
- progname = get_progname(argv[0]);
- while ((c = getopt_long(argc, argv, "?Vf:v:mdp:", long_options, &optindex)) != -1)
+ set_progname(argv[0]);
+
+ while ((c = getopt_long(argc, argv, "?Vf:vmdp:", long_options, &optindex)) != -1)
{
switch (c)
{
@@ -180,10 +163,10 @@ main(int argc, char **argv)
pid_file = optarg;
break;
case '?':
- help(progname);
+ help();
exit(SUCCESS);
case 'V':
- printf("%s %s (PostgreSQL %s)\n", progname, REPMGR_VERSION, PG_VERSION);
+ printf("%s %s (PostgreSQL %s)\n", progname(), REPMGR_VERSION, PG_VERSION);
exit(SUCCESS);
default:
usage();
@@ -200,7 +183,7 @@ main(int argc, char **argv)
* which case we'll need to refactor parse_config() not to abort,
* and return the error message.
*/
- load_config(config_file, &local_options, argv[0]);
+ load_config(config_file, verbose, &local_options, argv[0]);
if (daemonize)
{
@@ -230,10 +213,9 @@ main(int argc, char **argv)
strerror(errno));
}
- logger_init(&local_options, progname, local_options.loglevel,
- local_options.logfacility);
+ logger_init(&local_options, progname());
if (verbose)
- logger_min_verbose(LOG_INFO);
+ logger_set_verbose();
if (log_type == REPMGR_SYSLOG)
{
@@ -247,6 +229,7 @@ main(int argc, char **argv)
}
/* Initialise the repmgr schema name */
+ /* XXX check this handles quoting properly */
maxlen_snprintf(repmgr_schema, "%s%s", DEFAULT_REPMGR_SCHEMA_PREFIX,
local_options.cluster_name);
@@ -264,7 +247,7 @@ main(int argc, char **argv)
if (server_version_num > 0)
{
log_err(_("%s requires PostgreSQL %s or later\n"),
- progname,
+ progname(),
MIN_SUPPORTED_VERSION) ;
}
else
@@ -282,7 +265,7 @@ main(int argc, char **argv)
if (node_info.node_id == NODE_NOT_FOUND)
{
log_err(_("No metadata record found for this node - terminating\n"));
- log_notice(_("HINT: was this node registered with 'repmgr (master|standby) register'?\n"));
+ log_hint(_("Check that 'repmgr (master|standby) register' was executed for this node\n"));
terminate(ERR_BAD_CONFIG);
}
@@ -407,7 +390,7 @@ main(int argc, char **argv)
appendPQExpBuffer(&errmsg,
_("unable to connect to master node '%s'"),
- local_options.cluster_name);
+ master_options.node_name);
log_err("%s\n", errmsg.data);
@@ -457,7 +440,7 @@ main(int argc, char **argv)
do
{
- log_debug("standby check loop...\n");
+ log_verbose(LOG_DEBUG, "standby check loop...\n");
if (node_info.type == WITNESS)
{
@@ -467,6 +450,7 @@ main(int argc, char **argv)
{
standby_monitor();
}
+
sleep(local_options.monitor_interval_secs);
if (got_SIGHUP)
@@ -558,10 +542,10 @@ witness_monitor(void)
{
log_warning(
_("unable to determine a valid master server; waiting %i seconds to retry...\n"),
- local_options.reconnect_intvl
+ local_options.reconnect_interval
);
PQfinish(master_conn);
- sleep(local_options.reconnect_intvl);
+ sleep(local_options.reconnect_interval);
}
else
{
@@ -674,6 +658,7 @@ standby_monitor(void)
char last_wal_standby_received[MAXLEN];
char last_wal_standby_applied[MAXLEN];
char last_wal_standby_applied_timestamp[MAXLEN];
+ bool last_wal_standby_received_gte_replayed;
char sqlquery[QUERY_STR_LEN];
XLogRecPtr lsn_master;
@@ -701,23 +686,16 @@ standby_monitor(void)
{
PQExpBufferData errmsg;
- set_local_node_failed();
+ set_local_node_status();
initPQExpBuffer(&errmsg);
appendPQExpBuffer(&errmsg,
- _("failed to connect to local node, node marked as failed and terminating!"));
+ _("failed to connect to local node, node marked as failed!"));
log_err("%s\n", errmsg.data);
- create_event_record(master_conn,
- &local_options,
- local_options.node,
- "repmgrd_shutdown",
- false,
- errmsg.data);
-
- terminate(ERR_DB_CON);
+ goto continue_monitoring_standby;
}
upstream_conn = get_upstream_connection(my_local_conn,
@@ -738,7 +716,7 @@ standby_monitor(void)
check_connection(&upstream_conn, type, upstream_conninfo);
/*
* This takes up to local_options.reconnect_attempts *
- * local_options.reconnect_intvl seconds
+ * local_options.reconnect_interval seconds
*/
if (PQstatus(upstream_conn) != CONNECTION_OK)
@@ -846,6 +824,7 @@ standby_monitor(void)
PQfinish(upstream_conn);
+ continue_monitoring_standby:
/* Check if we still are a standby, we could have been promoted */
do
{
@@ -861,10 +840,13 @@ standby_monitor(void)
* will require manual resolution as there's no way of determing
* which master is the correct one.
*
+ * We should log a message so the user knows of the situation at hand.
+ *
* XXX check if the original master is still active and display a
* warning
*/
- log_err(_("It seems like we have been promoted, so exit from monitoring...\n"));
+ log_err(_("It seems this server was promoted manually (not by repmgr) so you might by in the presence of a split-brain.\n"));
+ log_err(_("Check your cluster and manually fix any anomaly.\n"));
terminate(1);
break;
@@ -874,8 +856,11 @@ standby_monitor(void)
if (!check_connection(&my_local_conn, "standby", NULL))
{
- set_local_node_failed();
- terminate(0);
+ set_local_node_status();
+ /*
+ * Let's continue checking, and if the postgres server on the
+ * standby comes back up, we will activate it again
+ */
}
break;
@@ -884,14 +869,20 @@ standby_monitor(void)
if (did_retry)
{
- log_info(_("standby connection recovered!\n"));
+ /*
+ * There's a possible situation where the standby went down for some reason
+ * (maintenance for example) and is now up and maybe connected once again to
+ * the stream. If we set the local standby node as failed and it's now running
+ * and receiving replication data, we should activate it again.
+ */
+ set_local_node_status();
+ log_info(_("standby connection recovered!\n"));
}
/* Fast path for the case where no history is requested */
if (!monitoring_history)
return;
-
/*
* If original master has gone away we'll need to get the new one
* from the upstream node to write monitoring information
@@ -953,7 +944,8 @@ standby_monitor(void)
/* Get local xlog info */
sqlquery_snprintf(sqlquery,
"SELECT CURRENT_TIMESTAMP, pg_last_xlog_receive_location(), "
- "pg_last_xlog_replay_location(), pg_last_xact_replay_timestamp() ");
+ "pg_last_xlog_replay_location(), pg_last_xact_replay_timestamp(), "
+ "pg_last_xlog_receive_location() >= pg_last_xlog_replay_location()");
res = PQexec(my_local_conn, sqlquery);
if (PQresultStatus(res) != PGRES_TUPLES_OK)
@@ -968,10 +960,30 @@ standby_monitor(void)
strncpy(last_wal_standby_received, PQgetvalue(res, 0, 1), MAXLEN);
strncpy(last_wal_standby_applied, PQgetvalue(res, 0, 2), MAXLEN);
strncpy(last_wal_standby_applied_timestamp, PQgetvalue(res, 0, 3), MAXLEN);
+ last_wal_standby_received_gte_replayed = (strcmp(PQgetvalue(res, 0, 4), "t") == 0)
+ ? true
+ : false;
+
PQclear(res);
+ /*
+ * In the unusual event of a standby becoming disconnected from the primary,
+ * while this repmgrd remains connected to the primary, subtracting
+ * "lsn_standby_applied" from "lsn_standby_received" and coercing to
+ * (long long unsigned int) will result in a meaningless, very large
+ * value which will overflow a BIGINT column and spew error messages into the
+ * PostgreSQL log. In the absence of a better strategy, skip attempting
+ * to insert a monitoring record.
+ */
+ if (last_wal_standby_received_gte_replayed == false)
+ {
+ log_verbose(LOG_WARNING,
+ "Invalid replication_lag value calculated - is this standby connected to its upstream?\n");
+ return;
+ }
+
/* Get master xlog info */
- sqlquery_snprintf(sqlquery, "SELECT pg_current_xlog_location()");
+ sqlquery_snprintf(sqlquery, "SELECT pg_catalog.pg_current_xlog_location()");
res = PQexec(master_conn, sqlquery);
if (PQresultStatus(res) != PGRES_TUPLES_OK)
@@ -998,10 +1010,10 @@ standby_monitor(void)
" last_monitor_time, last_apply_time, "
" last_wal_primary_location, last_wal_standby_location, "
" replication_lag, apply_lag ) "
- " VALUES(%d, %d, "
- " '%s'::TIMESTAMP WITH TIME ZONE, '%s'::TIMESTAMP WITH TIME ZONE, "
- " '%s', '%s', "
- " %llu, %llu) ",
+ " VALUES(%d, %d, "
+ " '%s'::TIMESTAMP WITH TIME ZONE, '%s'::TIMESTAMP WITH TIME ZONE, "
+ " '%s', '%s', "
+ " %llu, %llu) ",
get_repmgr_schema_quoted(master_conn),
master_options.node, local_options.node,
monitor_standby_timestamp, last_wal_standby_applied_timestamp,
@@ -1013,7 +1025,8 @@ standby_monitor(void)
* Execute the query asynchronously, but don't check for a result. We will
* check the result next time we pause for a monitor step.
*/
- log_debug("standby_monitor: %s\n", sqlquery);
+ log_verbose(LOG_DEBUG, "standby_monitor:() %s\n", sqlquery);
+
if (PQsendQuery(master_conn, sqlquery) == 0)
log_warning(_("query could not be sent to master. %s\n"),
PQerrorMessage(master_conn));
@@ -1055,10 +1068,10 @@ do_master_failover(void)
t_node_info nodes[FAILOVER_NODES_MAX_CHECK];
/* Store details of the failed node here */
- t_node_info failed_master = {-1, NO_UPSTREAM_NODE, "", InvalidXLogRecPtr, UNKNOWN, false, false};
+ t_node_info failed_master = T_NODE_INFO_INITIALIZER;
/* Store details of the best candidate for promotion to master here */
- t_node_info best_candidate = {-1, NO_UPSTREAM_NODE, "", InvalidXLogRecPtr, UNKNOWN, false, false};
+ t_node_info best_candidate = T_NODE_INFO_INITIALIZER;
/* get a list of standby nodes, including myself */
sprintf(sqlquery,
@@ -1187,12 +1200,13 @@ do_master_failover(void)
terminate(ERR_FAILOVER_FAIL);
}
- sqlquery_snprintf(sqlquery, "SELECT pg_last_xlog_receive_location()");
+ sqlquery_snprintf(sqlquery, "SELECT pg_catalog.pg_last_xlog_receive_location()");
res = PQexec(node_conn, sqlquery);
if (PQresultStatus(res) != PGRES_TUPLES_OK)
{
log_info(_("unable to retrieve node's last standby location: %s\n"),
PQerrorMessage(node_conn));
+
log_debug(_("connection details: %s\n"), nodes[i].conninfo_str);
PQclear(res);
PQfinish(node_conn);
@@ -1218,7 +1232,7 @@ do_master_failover(void)
}
/* last we get info about this node, and update shared memory */
- sprintf(sqlquery, "SELECT pg_last_xlog_receive_location()");
+ sprintf(sqlquery, "SELECT pg_catalog.pg_last_xlog_receive_location()");
res = PQexec(my_local_conn, sqlquery);
if (PQresultStatus(res) != PGRES_TUPLES_OK)
{
@@ -1284,7 +1298,7 @@ do_master_failover(void)
res = PQexec(node_conn, sqlquery);
if (PQresultStatus(res) != PGRES_TUPLES_OK)
{
- log_err(_("PQexec failed: %s.\nReport an invalid value to not"
+ log_err(_("PQexec failed: %s.\nReport an invalid value to not "
"be considered as new master and exit.\n"),
PQerrorMessage(node_conn));
PQclear(res);
@@ -1336,6 +1350,9 @@ do_master_failover(void)
PQclear(res);
/* If position is 0/0, keep checking */
+ /* XXX we should add a timeout here to prevent infinite looping
+ * if the other node's repmgrd is not up
+ */
continue;
}
@@ -1413,8 +1430,7 @@ do_master_failover(void)
/* wait */
sleep(5);
- if (verbose)
- log_info(_("this node is the best candidate to be the new master, promoting...\n"));
+ log_notice(_("this node is the best candidate to be the new master, promoting...\n"));
log_debug(_("promote command is: \"%s\"\n"),
local_options.promote_command);
@@ -1463,10 +1479,8 @@ do_master_failover(void)
/* wait */
sleep(10);
- if (verbose)
- log_info(_("node %d is the best candidate to be the new master, we should follow it...\n"),
- best_candidate.node_id);
- log_debug(_("follow command is: \"%s\"\n"), local_options.follow_command);
+ log_info(_("node %d is the best candidate for new master, attempting to follow...\n"),
+ best_candidate.node_id);
/*
* The new master may some time to be promoted. The follow command
@@ -1477,57 +1491,23 @@ do_master_failover(void)
fflush(stderr);
}
- /*
- * If 9.4 or later, and replication slots in use, we'll need to create a
- * slot on the new master
- */
- new_master_conn = establish_db_connection(best_candidate.conninfo_str, true);
-
- if (local_options.use_replication_slots)
- {
- if (create_replication_slot(new_master_conn, node_info.slot_name) == false)
- {
-
- appendPQExpBuffer(&event_details,
- _("Unable to create slot '%s' on the master node: %s"),
- node_info.slot_name,
- PQerrorMessage(new_master_conn));
- log_err("%s\n", event_details.data);
-
- create_event_record(new_master_conn,
- &local_options,
- node_info.node_id,
- "repmgrd_failover_follow",
- false,
- event_details.data);
-
- PQfinish(new_master_conn);
- terminate(ERR_DB_QUERY);
- }
- }
+ log_debug(_("executing follow command: \"%s\"\n"), local_options.follow_command);
r = system(local_options.follow_command);
if (r != 0)
{
- log_err(_("follow command failed. You could check and try it manually.\n"));
- terminate(ERR_BAD_CONFIG);
- }
-
- /* and reconnect to the local database */
- my_local_conn = establish_db_connection(local_options.conninfo, true);
-
- /* update node information to reflect new status */
- if (update_node_record_set_upstream(new_master_conn, local_options.cluster_name, node_info.node_id, best_candidate.node_id) == false)
- {
appendPQExpBuffer(&event_details,
- _("Unable to update node record for node %i (following new upstream node %i)"),
- node_info.node_id,
- best_candidate.node_id);
+ _("Unable to execute follow command:\n %s"),
+ local_options.follow_command);
log_err("%s\n", event_details.data);
- create_event_record(new_master_conn,
+ /* It won't be possible to write to the event notification
+ * table but we should be able to generate an external notification
+ * if required.
+ */
+ create_event_record(NULL,
&local_options,
node_info.node_id,
"repmgrd_failover_follow",
@@ -1537,13 +1517,20 @@ do_master_failover(void)
terminate(ERR_BAD_CONFIG);
}
+ /* and reconnect to the local database */
+ my_local_conn = establish_db_connection(local_options.conninfo, true);
+
/* update internal record for this node*/
+ new_master_conn = establish_db_connection(best_candidate.conninfo_str, true);
+
node_info = get_node_info(new_master_conn, local_options.cluster_name, local_options.node);
appendPQExpBuffer(&event_details,
_("Node %i now following new upstream node %i"),
node_info.node_id,
best_candidate.node_id);
+ log_info("%s\n", event_details.data);
+
create_event_record(new_master_conn,
&local_options,
node_info.node_id,
@@ -1570,6 +1557,8 @@ do_master_failover(void)
* It might be worth providing a selection of reconnection strategies
* as different behaviour might be desirable in different situations;
* or maybe the option not to reconnect might be required?
+ *
+ * XXX check this handles replication slots gracefully
*/
static bool
do_upstream_standby_failover(t_node_info upstream_node)
@@ -1578,6 +1567,7 @@ do_upstream_standby_failover(t_node_info upstream_node)
char sqlquery[QUERY_STR_LEN];
int upstream_node_id = node_info.upstream_node_id;
int r;
+ PQExpBufferData event_details;
log_debug(_("do_upstream_standby_failover(): performing failover for node %i\n"),
node_info.node_id);
@@ -1647,26 +1637,65 @@ do_upstream_standby_failover(t_node_info upstream_node)
}
PQclear(res);
- sleep(local_options.reconnect_intvl);
+ sleep(local_options.reconnect_interval);
}
/* Close the connection to this server */
PQfinish(my_local_conn);
my_local_conn = NULL;
+ initPQExpBuffer(&event_details);
+
/* Follow new upstream */
r = system(local_options.follow_command);
if (r != 0)
{
- log_err(_("follow command failed. You could check and try it manually.\n"));
+ appendPQExpBuffer(&event_details,
+ _("Unable to execute follow command:\n %s"),
+ local_options.follow_command);
+
+ log_err("%s\n", event_details.data);
+
+ /* It won't be possible to write to the event notification
+ * table but we should be able to generate an external notification
+ * if required.
+ */
+ create_event_record(NULL,
+ &local_options,
+ node_info.node_id,
+ "repmgrd_failover_follow",
+ false,
+ event_details.data);
terminate(ERR_BAD_CONFIG);
}
if (update_node_record_set_upstream(master_conn, local_options.cluster_name, node_info.node_id, upstream_node_id) == false)
{
+ appendPQExpBuffer(&event_details,
+ _("Unable to set node %i's new upstream ID to %i"),
+ node_info.node_id,
+ upstream_node_id);
+ create_event_record(NULL,
+ &local_options,
+ node_info.node_id,
+ "repmgrd_failover_follow",
+ false,
+ event_details.data);
terminate(ERR_BAD_CONFIG);
}
+ appendPQExpBuffer(&event_details,
+ _("Node %i is now following upstream node %i"),
+ node_info.node_id,
+ upstream_node_id);
+
+ create_event_record(NULL,
+ &local_options,
+ node_info.node_id,
+ "repmgrd_failover_follow",
+ true,
+ event_details.data);
+
my_local_conn = establish_db_connection(local_options.conninfo, true);
return true;
@@ -1681,7 +1710,7 @@ check_connection(PGconn **conn, const char *type, const char *conninfo)
/*
* Check if the node is still available if after
- * local_options.reconnect_attempts * local_options.reconnect_intvl
+ * local_options.reconnect_attempts * local_options.reconnect_interval
* seconds of retries we cannot reconnect return false
*/
for (connection_retries = 0; connection_retries < local_options.reconnect_attempts; connection_retries++)
@@ -1699,9 +1728,9 @@ check_connection(PGconn **conn, const char *type, const char *conninfo)
{
log_warning(_("connection to %s has been lost, trying to recover... %i seconds before failover decision\n"),
type,
- (local_options.reconnect_intvl * (local_options.reconnect_attempts - connection_retries)));
- /* wait local_options.reconnect_intvl seconds between retries */
- sleep(local_options.reconnect_intvl);
+ (local_options.reconnect_interval * (local_options.reconnect_attempts - connection_retries)));
+ /* wait local_options.reconnect_interval seconds between retries */
+ sleep(local_options.reconnect_interval);
}
else
{
@@ -1728,7 +1757,7 @@ check_connection(PGconn **conn, const char *type, const char *conninfo)
/*
- * set_local_node_failed()
+ * set_local_node_status()
*
* If failure of the local node is detected, attempt to connect
* to the current master server (as stored in the global variable
@@ -1736,16 +1765,16 @@ check_connection(PGconn **conn, const char *type, const char *conninfo)
*/
static bool
-set_local_node_failed(void)
+set_local_node_status(void)
{
- PGresult *res;
+ PGresult *res;
char sqlquery[QUERY_STR_LEN];
- int active_master_node_id = NODE_NOT_FOUND;
+ int active_master_node_id = NODE_NOT_FOUND;
char master_conninfo[MAXLEN];
if (!check_connection(&master_conn, "master", NULL))
{
- log_err(_("set_local_node_failed(): Unable to connect to last known master node\n"));
+ log_err(_("set_local_node_status(): Unable to connect to last known master node\n"));
return false;
}
@@ -1799,17 +1828,16 @@ set_local_node_failed(void)
/*
- * Attempt to set own record as inactive
+ * Attempt to set the node's "active" record to the correct value,
+ * based on whether the local node is currently running as a standby.
*/
- sqlquery_snprintf(sqlquery,
- "UPDATE %s.repl_nodes "
- " SET active = FALSE "
- " WHERE id = %i ",
- get_repmgr_schema_quoted(master_conn),
- node_info.node_id);
- res = PQexec(master_conn, sqlquery);
- if (PQresultStatus(res) != PGRES_COMMAND_OK)
+ if (!update_node_record_status(master_conn,
+ local_options.cluster_name,
+ node_info.node_id,
+ "standby",
+ node_info.upstream_node_id,
+ is_standby(my_local_conn)==1))
{
log_err(_("unable to set local node %i as inactive on master: %s\n"),
node_info.node_id,
@@ -1834,7 +1862,7 @@ check_cluster_configuration(PGconn *conn)
sqlquery_snprintf(sqlquery,
"SELECT oid FROM pg_class "
" WHERE oid = '%s.repl_nodes'::regclass ",
- get_repmgr_schema());
+ get_repmgr_schema_quoted(master_conn));
res = PQexec(conn, sqlquery);
if (PQresultStatus(res) != PGRES_TUPLES_OK)
{
@@ -1961,18 +1989,18 @@ lsn_to_xlogrecptr(char *lsn, bool *format_ok)
void
usage(void)
{
- log_err(_("%s: Replicator manager daemon \n"), progname);
- log_err(_("Try \"%s --help\" for more information.\n"), progname);
+ log_err(_("%s: Replicator manager daemon \n"), progname());
+ log_err(_("Try \"%s --help\" for more information.\n"), progname());
}
void
-help(const char *progname)
+help(void)
{
- printf(_("%s: replication management daemon for PostgreSQL\n"), progname);
+ printf(_("%s: replication management daemon for PostgreSQL\n"), progname());
printf(_("\n"));
printf(_("Usage:\n"));
- printf(_(" %s [OPTIONS]\n"), progname);
+ printf(_(" %s [OPTIONS]\n"), progname());
printf(_("\n"));
printf(_("Options:\n"));
printf(_(" -?, --help show this help, then exit\n"));
@@ -1983,7 +2011,7 @@ help(const char *progname)
printf(_(" -d, --daemonize detach process from foreground\n"));
printf(_(" -p, --pid-file=PATH write a PID file\n"));
printf(_("\n"));
- printf(_("%s monitors a cluster of servers and optionally performs failover.\n"), progname);
+ printf(_("%s monitors a cluster of servers and optionally performs failover.\n"), progname());
}
@@ -2021,7 +2049,7 @@ terminate(int retval)
unlink(pid_file);
}
- log_info(_("%s terminating...\n"), progname);
+ log_info(_("%s terminating...\n"), progname());
exit(retval);
}
@@ -2228,7 +2256,7 @@ get_node_info(PGconn *conn, char *cluster, int node_id)
{
PGresult *res;
- t_node_info node_info = { NODE_NOT_FOUND, NO_UPSTREAM_NODE, "", InvalidXLogRecPtr, UNKNOWN, false, false};
+ t_node_info node_info = T_NODE_INFO_INITIALIZER;
res = get_node_record(conn, cluster, node_id);
diff --git a/version.h b/version.h
index 63f4678..6affd7a 100644
--- a/version.h
+++ b/version.h
@@ -1,6 +1,6 @@
#ifndef _VERSION_H_
#define _VERSION_H_
-#define REPMGR_VERSION "3.0.2"
+#define REPMGR_VERSION "3.0.3"
#endif