From 0c92bc8a96fb23a682d8640ba8b5b5076c28c19c Mon Sep 17 00:00:00 2001 From: Tim Starling Date: Tue, 19 Jul 2022 09:54:21 +1000 Subject: rdbms: Instrument LoadBalancer with statsd metrics * Export the LoadMonitor weight metric to statsd on each global cache miss. * Export the replication lag at the same time. This can replace the getLagTimes.php cron job and should eliminate the sawtooth effect seen in the cron job exports, which is presumably due to the offset between cron job start time and heartbeat time. It should also work around the bug which causes s3 to be missing. Also: * Fix a log message in TransactionProfiler. actualSeconds -> actual since this log message is used for counts as well as time. Bug: T313004 Change-Id: I61e45870d750019ed2c92f45b2f8b9c33a7e7d65 --- includes/libs/rdbms/TransactionProfiler.php | 4 +- includes/libs/rdbms/lbfactory/LBFactory.php | 7 +++- includes/libs/rdbms/loadbalancer/LoadBalancer.php | 6 +++ includes/libs/rdbms/loadmonitor/ILoadMonitor.php | 3 +- includes/libs/rdbms/loadmonitor/LoadMonitor.php | 45 +++++++++++++++------- .../libs/rdbms/loadmonitor/LoadMonitorNull.php | 4 ++ 6 files changed, 51 insertions(+), 18 deletions(-) (limited to 'includes/libs') diff --git a/includes/libs/rdbms/TransactionProfiler.php b/includes/libs/rdbms/TransactionProfiler.php index 38e44cd6e008..d42189fdd571 100644 --- a/includes/libs/rdbms/TransactionProfiler.php +++ b/includes/libs/rdbms/TransactionProfiler.php @@ -452,7 +452,7 @@ class TransactionProfiler implements LoggerAwareInterface { $max = $this->expect[$expectation][self::FLD_LIMIT]; $by = $this->expect[$expectation][self::FLD_FNAME]; - $message = "Expectation ($expectation <=) $max by $by not met (actual: {actual})"; + $message = "Expectation ($expectation <= $max) by $by not met (actual: {actual})"; if ( $trxId ) { $message .= ' in trx #{trxId}'; } @@ -463,7 +463,7 @@ class TransactionProfiler implements LoggerAwareInterface { 'measure' => $expectation, 'maxSeconds' => $max, 'by' => $by, - 'actualSeconds' => $actual, + 'actual' => $actual, 'query' => $this->getGeneralizedSql( $query ), 'exception' => new RuntimeException(), 'trxId' => $trxId, diff --git a/includes/libs/rdbms/lbfactory/LBFactory.php b/includes/libs/rdbms/lbfactory/LBFactory.php index 36797563c7ef..caa791c9d567 100644 --- a/includes/libs/rdbms/lbfactory/LBFactory.php +++ b/includes/libs/rdbms/lbfactory/LBFactory.php @@ -27,7 +27,9 @@ use BagOStuff; use EmptyBagOStuff; use Exception; use Generator; +use Liuggio\StatsdClient\Factory\StatsdDataFactoryInterface; use LogicException; +use NullStatsdDataFactory; use Psr\Log\LoggerInterface; use Psr\Log\NullLogger; use RuntimeException; @@ -52,6 +54,8 @@ abstract class LBFactory implements ILBFactory { private $profiler; /** @var TransactionProfiler */ private $trxProfiler; + /** @var StatsdDataFactoryInterface */ + private $statsd; /** @var LoggerInterface */ private $replLogger; /** @var LoggerInterface */ @@ -71,7 +75,6 @@ abstract class LBFactory implements ILBFactory { protected $srvCache; /** @var WANObjectCache */ protected $wanCache; - /** @var DatabaseDomain Local domain */ protected $localDomain; @@ -151,6 +154,7 @@ abstract class LBFactory implements ILBFactory { $this->profiler = $conf['profiler'] ?? null; $this->trxProfiler = $conf['trxProfiler'] ?? new TransactionProfiler(); + $this->statsd = $conf['statsdDataFactory'] ?? new NullStatsdDataFactory(); $this->csProvider = $conf['criticalSectionProvider'] ?? null; @@ -670,6 +674,7 @@ abstract class LBFactory implements ILBFactory { 'perfLogger' => $this->perfLogger, 'errorLogger' => $this->errorLogger, 'deprecationLogger' => $this->deprecationLogger, + 'statsdDataFactory' => $this->statsd, 'cliMode' => $this->cliMode, 'agent' => $this->agent, 'maxLag' => $this->maxLag, diff --git a/includes/libs/rdbms/loadbalancer/LoadBalancer.php b/includes/libs/rdbms/loadbalancer/LoadBalancer.php index 450a641ea38f..a3ffe2626948 100644 --- a/includes/libs/rdbms/loadbalancer/LoadBalancer.php +++ b/includes/libs/rdbms/loadbalancer/LoadBalancer.php @@ -25,7 +25,9 @@ use ArrayUtils; use BagOStuff; use EmptyBagOStuff; use InvalidArgumentException; +use Liuggio\StatsdClient\Factory\StatsdDataFactoryInterface; use LogicException; +use NullStatsdDataFactory; use Psr\Log\LoggerInterface; use Psr\Log\NullLogger; use RuntimeException; @@ -58,6 +60,8 @@ class LoadBalancer implements ILoadBalancerForOwner { private $profiler; /** @var TransactionProfiler */ private $trxProfiler; + /** @var StatsdDataFactoryInterface */ + private $statsd; /** @var LoggerInterface */ private $connLogger; /** @var LoggerInterface */ @@ -232,6 +236,7 @@ class LoadBalancer implements ILoadBalancerForOwner { $this->clusterName = $params['clusterName'] ?? null; $this->profiler = $params['profiler'] ?? null; $this->trxProfiler = $params['trxProfiler'] ?? new TransactionProfiler(); + $this->statsd = $params['statsdDataFactory'] ?? new NullStatsdDataFactory(); $this->csProvider = $params['criticalSectionProvider'] ?? null; @@ -419,6 +424,7 @@ class LoadBalancer implements ILoadBalancerForOwner { $this->loadMonitor = new $class( $this, $this->srvCache, $this->wanCache, $this->loadMonitorConfig ); $this->loadMonitor->setLogger( $this->replLogger ); + $this->loadMonitor->setStatsdDataFactory( $this->statsd ); } return $this->loadMonitor; diff --git a/includes/libs/rdbms/loadmonitor/ILoadMonitor.php b/includes/libs/rdbms/loadmonitor/ILoadMonitor.php index 2ee49f60b652..f7d5214c2e74 100644 --- a/includes/libs/rdbms/loadmonitor/ILoadMonitor.php +++ b/includes/libs/rdbms/loadmonitor/ILoadMonitor.php @@ -25,6 +25,7 @@ namespace Wikimedia\Rdbms; use BagOStuff; use Psr\Log\LoggerAwareInterface; +use StatsdAwareInterface; use WANObjectCache; /** @@ -32,7 +33,7 @@ use WANObjectCache; * * @ingroup Database */ -interface ILoadMonitor extends LoggerAwareInterface { +interface ILoadMonitor extends LoggerAwareInterface, StatsdAwareInterface { /** * Construct a new LoadMonitor with a given LoadBalancer parent * diff --git a/includes/libs/rdbms/loadmonitor/LoadMonitor.php b/includes/libs/rdbms/loadmonitor/LoadMonitor.php index 5c1a7983ac4a..f7e821d8062c 100644 --- a/includes/libs/rdbms/loadmonitor/LoadMonitor.php +++ b/includes/libs/rdbms/loadmonitor/LoadMonitor.php @@ -22,6 +22,8 @@ namespace Wikimedia\Rdbms; use BagOStuff; +use Liuggio\StatsdClient\Factory\StatsdDataFactoryInterface; +use NullStatsdDataFactory; use Psr\Log\LoggerInterface; use Psr\Log\NullLogger; use RuntimeException; @@ -47,6 +49,8 @@ class LoadMonitor implements ILoadMonitor { protected $wanCache; /** @var LoggerInterface */ protected $replLogger; + /** @var StatsdDataFactoryInterface */ + protected $statsd; /** @var float Moving average ratio (e.g. 0.1 for 10% weight to new weight) */ private $movingAveRatio; @@ -83,6 +87,7 @@ class LoadMonitor implements ILoadMonitor { $this->srvCache = $srvCache; $this->wanCache = $wCache; $this->replLogger = new NullLogger(); + $this->statsd = new NullStatsdDataFactory(); $this->movingAveRatio = $options['movingAveRatio'] ?? 0.1; $this->lagWarnThreshold = $options['lagWarnThreshold'] ?? LoadBalancer::MAX_LAG_DEFAULT; @@ -92,6 +97,10 @@ class LoadMonitor implements ILoadMonitor { $this->replLogger = $logger; } + public function setStatsdDataFactory( StatsdDataFactoryInterface $statsFactory ) { + $this->statsd = $statsFactory; + } + final public function scaleLoads( array &$weightByServer, $domain ) { $serverIndexes = array_keys( $weightByServer ); $states = $this->getServerStates( $serverIndexes, $domain ); @@ -205,6 +214,7 @@ class LoadMonitor implements ILoadMonitor { } $priorScales = $priorStates ? $priorStates['weightScales'] : []; + $cluster = $this->lb->getClusterName(); $lagTimes = []; $weightScales = []; @@ -241,9 +251,12 @@ class LoadMonitor implements ILoadMonitor { $naiveScale, $this->movingAveRatio ); - // Scale from 0% to 100% of nominal weight - $weightScales[$i] = max( $newScale, 0.0 ); + $newScale = max( $newScale, 0.0 ); + + $weightScales[$i] = $newScale; + $statHost = str_replace( '.', '_', $host ); + $this->statsd->gauge( "loadbalancer.weight.$cluster.$statHost", $newScale ); // Mark replication lag on this server as "false" if it is unreachable if ( !$conn ) { @@ -257,26 +270,30 @@ class LoadMonitor implements ILoadMonitor { // Determine the amount of replication lag on this server try { - $lagTimes[$i] = $conn->getLag(); + $lag = $conn->getLag(); } catch ( DBError $e ) { // Mark the lag time as "false" if it cannot be queried - $lagTimes[$i] = false; + $lag = false; } + $lagTimes[$i] = $lag; - if ( $lagTimes[$i] === false ) { + if ( $lag === false ) { $this->replLogger->error( __METHOD__ . ": host {db_server} is not replicating?", [ 'db_server' => $host ] ); - } elseif ( $lagTimes[$i] > $this->lagWarnThreshold ) { - $this->replLogger->warning( - "Server {db_server} has {lag} seconds of lag (>= {maxlag})", - [ - 'db_server' => $host, - 'lag' => $lagTimes[$i], - 'maxlag' => $this->lagWarnThreshold - ] - ); + } else { + $this->statsd->timing( "loadbalancer.lag.$cluster.$statHost", $lag * 1000 ); + if ( $lag > $this->lagWarnThreshold ) { + $this->replLogger->warning( + "Server {db_server} has {lag} seconds of lag (>= {maxlag})", + [ + 'db_server' => $host, + 'lag' => $lag, + 'maxlag' => $this->lagWarnThreshold + ] + ); + } } if ( $close ) { diff --git a/includes/libs/rdbms/loadmonitor/LoadMonitorNull.php b/includes/libs/rdbms/loadmonitor/LoadMonitorNull.php index cbc77f1a903e..a2a21e1048a5 100644 --- a/includes/libs/rdbms/loadmonitor/LoadMonitorNull.php +++ b/includes/libs/rdbms/loadmonitor/LoadMonitorNull.php @@ -22,6 +22,7 @@ namespace Wikimedia\Rdbms; use BagOStuff; +use Liuggio\StatsdClient\Factory\StatsdDataFactoryInterface; use Psr\Log\LoggerInterface; use WANObjectCache; @@ -34,6 +35,9 @@ class LoadMonitorNull implements ILoadMonitor { public function setLogger( LoggerInterface $logger ) { } + public function setStatsdDataFactory( StatsdDataFactoryInterface $statsFactory ) { + } + public function scaleLoads( array &$loads, $domain ) { } -- cgit v1.2.3