aboutsummaryrefslogtreecommitdiffstats
path: root/includes/libs/rdbms/database/QueryBuilderFromRawSql.php
blob: 3be8e0e13f69d9b23695465daf5bdd6aa994f4fc (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
<?php
/**
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 * http://www.gnu.org/copyleft/gpl.html
 *
 * @file
 */
namespace Wikimedia\Rdbms;

use Wikimedia\Rdbms\Platform\SQLPlatform;

/**
 * This is to contain any regex on SQL work and get rid of them eventually
 *
 * This is a radioactive swamp and an extremely flawed and buggy last resort
 * for when the information has not been provided via Query object.
 * Bugs are to be expected in the regexes here.
 *
 * @ingroup Database
 * @internal
 * @since 1.41
 */
class QueryBuilderFromRawSql {
	/** All the bits of QUERY_WRITE_* flags */
	private const QUERY_CHANGE_MASK = (
		SQLPlatform::QUERY_CHANGE_NONE |
		SQLPlatform::QUERY_CHANGE_TRX |
		SQLPlatform::QUERY_CHANGE_ROWS |
		SQLPlatform::QUERY_CHANGE_SCHEMA |
		SQLPlatform::QUERY_CHANGE_LOCKS
	);

	private const SCHEMA_CHANGE_VERBS = [
		'CREATE',
		'CREATE TEMPORARY',
		'CREATE INDEX',
		'CREATE DATABASE',
		'ALTER',
		'ALTER DATABASE',
		'DROP',
		'DROP INDEX',
		'DROP DATABASE',
	];

	private const TRX_VERBS = [
		'BEGIN',
		'COMMIT',
		'ROLLBACK',
		'SAVEPOINT',
		'RELEASE SAVEPOINT',
		'ROLLBACK TO SAVEPOINT',
	];

	private static string $queryVerbRegex;

	/**
	 * @param string $sql
	 * @param int $flags
	 * @param string $tablePrefix
	 * @return Query
	 */
	public static function buildQuery( string $sql, $flags, string $tablePrefix = '' ) {
		$verb = self::getQueryVerb( $sql );

		if ( ( $flags & self::QUERY_CHANGE_MASK ) == 0 ) {
			$isWriteQuery = self::isWriteQuery( $sql );
			if ( $isWriteQuery ) {
				if ( in_array( $verb, self::SCHEMA_CHANGE_VERBS, true ) ) {
					$flags |= SQLPlatform::QUERY_CHANGE_SCHEMA;
				} else {
					$flags |= SQLPlatform::QUERY_CHANGE_ROWS;
				}
			} else {
				if ( in_array( $verb, self::TRX_VERBS, true ) ) {
					$flags |= SQLPlatform::QUERY_CHANGE_TRX;
				} else {
					$flags |= SQLPlatform::QUERY_CHANGE_NONE;
				}
			}
		}

		return new Query(
			$sql,
			$flags,
			$verb,
			self::getWriteTable( $sql, $tablePrefix )
		);
	}

	private static function isWriteQuery( $rawSql ) {
		// Treat SELECT queries without FOR UPDATE queries as non-writes. This matches
		// how MySQL enforces read_only (FOR SHARE and LOCK IN SHADE MODE are allowed).
		// Handle (SELECT ...) UNION (SELECT ...) queries in a similar fashion.
		if ( preg_match( '/^\s*\(?SELECT\b/i', $rawSql ) ) {
			return (bool)preg_match( '/\bFOR\s+UPDATE\)?\s*$/i', $rawSql );
		}
		// BEGIN and COMMIT queries are considered non-write queries here.
		// Database backends and drivers (MySQL, MariaDB, php-mysqli) generally
		// treat these as write queries, in that their results have "affected rows"
		// as meta data as from writes, instead of "num rows" as from reads.
		// But, we treat them as non-write queries because when reading data (from
		// either replica or primary DB) we use transactions to enable repeatable-read
		// snapshots, which ensures we get consistent results from the same snapshot
		// for all queries within a request. Use cases:
		// - Treating these as writes would trigger ChronologyProtector (see method doc).
		// - We use this method to reject writes to replicas, but we need to allow
		//   use of transactions on replicas for read snapshots. This is fine given
		//   that transactions by themselves don't make changes, only actual writes
		//   within the transaction matter, which we still detect.
		return !preg_match(
			'/^\s*(BEGIN|ROLLBACK|COMMIT|SAVEPOINT|RELEASE|SET|SHOW|EXPLAIN|USE)\b/i',
			$rawSql
		);
	}

	/**
	 * @param string $sql SQL query
	 * @return string
	 */
	private static function getQueryVerb( $sql ) {
		// @phan-suppress-next-line PhanRedundantCondition https://github.com/phan/phan/issues/4720
		if ( !isset( self::$queryVerbRegex ) ) {
			$multiwordVerbsRegex = implode( '|', array_map(
				static fn ( $words ) => str_replace( ' ', '\s+', $words ),
				Query::MULTIWORD_VERBS
			) );
			self::$queryVerbRegex = "/^\s*($multiwordVerbsRegex|[a-z]+)/i";
		}
		return preg_match( self::$queryVerbRegex, $sql, $m ) ? strtoupper( $m[1] ) : '';
	}

	/**
	 * @param string $sql
	 * @param string $tablePrefix
	 * @return string|null
	 */
	private static function getWriteTable( $sql, $tablePrefix ) {
		// Regex for basic queries that can create/change/drop temporary tables.
		// For simplicity, this only looks for tables with sensible alphanumeric names.
		// Temporary tables only need simple programming names anyway.
		$regex = <<<REGEX
		/^
			(?:
				(?:INSERT|REPLACE)\s+(?:\w+\s+)*?INTO
				| UPDATE(?:\s+OR\s+\w+|\s+IGNORE|\s+ONLY)?
				| DELETE\s+(?:\w+\s+)*?FROM(?:\s+ONLY)?
				| CREATE\s+(?:TEMPORARY\s+)?TABLE(?:\s+IF\s+NOT\s+EXISTS)?
				| DROP\s+(?:TEMPORARY\s+)?TABLE(?:\s+IF\s+EXISTS)?
				| TRUNCATE\s+(?:TEMPORARY\s+)?TABLE
				| ALTER\s+TABLE
			) \s+
			(\w+|`\w+`|'\w+'|"\w+")
		/ix
		REGEX;
		if ( preg_match( $regex, $sql, $m ) ) {
			$tableName = trim( $m[1], "\"'`" );
			if ( str_starts_with( $tableName, $tablePrefix ) ) {
				$tableName = substr( $tableName, strlen( $tablePrefix ) );
			}
			return $tableName;
		}
		return null;
	}

	/**
	 * Removes most variables from an SQL query and replaces them with X or N for numbers.
	 * It's only slightly flawed. Don't use for anything important.
	 *
	 * @param string $sql A SQL Query
	 *
	 * @return string
	 */
	public static function generalizeSQL( $sql ) {
		# This does the same as the regexp below would do, but in such a way
		# as to avoid crashing php on some large strings.
		# $sql = preg_replace( "/'([^\\\\']|\\\\.)*'|\"([^\\\\\"]|\\\\.)*\"/", "'X'", $sql );

		$sql = str_replace( "\\\\", '', $sql );
		$sql = str_replace( "\\'", '', $sql );
		$sql = str_replace( "\\\"", '', $sql );
		$sql = preg_replace( "/'.*'/s", "'X'", $sql );
		$sql = preg_replace( '/".*"/s', "'X'", $sql );

		# All newlines, tabs, etc replaced by single space
		$sql = preg_replace( '/\s+/', ' ', $sql );

		# All numbers => N,
		# except the ones surrounded by characters, e.g. l10n
		$sql = preg_replace( '/-?\d++(,-?\d++)+/', 'N,...,N', $sql );
		$sql = preg_replace( '/(?<![a-zA-Z])-?\d+(?![a-zA-Z])/', 'N', $sql );

		return $sql;
	}
}