aboutsummaryrefslogtreecommitdiffstats
path: root/includes
diff options
context:
space:
mode:
authorTim Starling <tstarling@wikimedia.org>2023-12-04 11:09:12 +1100
committerTim Starling <tstarling@wikimedia.org>2023-12-04 20:20:32 +1100
commit497efa4ed6b8630e414c4c917ed95f66ee6235fd (patch)
treea7655661bf60fca376e57ef908b05345b69a9f8f /includes
parenta007f6e09a9ed65a77e51dfe0435f85e9841d23c (diff)
downloadmediawikicore-497efa4ed6b8630e414c4c917ed95f66ee6235fd.tar.gz
mediawikicore-497efa4ed6b8630e414c4c917ed95f66ee6235fd.zip
Clean up DjVuImage::retrieveMetaData including shellbox support
Following up Id9539a28f0f143539334002c3: * Don't run the script twice. * Wrap the decoded dump in an array with key "data". * The default assignment for DJVU_DUMP and DJVU_TXT had the effect of running the scripts anyway if the configuration variables are null. Don't do that. * If both $wgDjvuDump and $wgDjvuTxt are null, don't run the shellbox script. * Centralise shell location configuration. * Factor out call to convertDumpToJSON(). * Instead of txt_exit_code, just use existence of the file to communicate success. This avoids a deprecation warning if txt_exit_code was not received, due to passing null to trim. * Check for the existence of the result files instead of just trying to use them. * Check the exit status of the overall script. * Confirm that the BoxedCommand branch is functional and works in CLI mode by using it in DjvuTest. * Change the service name from "media" to "djvu". Existing examples are "pagedtiffhandler" and "pdfhandler", i.e. named after the extension, there is no other core caller. I think it should be more fine-grained than "media". The name was possibly a conflation with the ProductionServices array key. Also: * Check the exit status of djvudump and don't use the output file if it is non-zero. * Check the return value of convertDumpToJSON(). * Don't use isset() unless error suppression is intended. Bug: T352515 Change-Id: If41a2baada2e4e2462518c1f437af458feb29632
Diffstat (limited to 'includes')
-rw-r--r--includes/MainConfigNames.php12
-rw-r--r--includes/MainConfigSchema.php30
-rw-r--r--includes/config-schema.php10
-rw-r--r--includes/media/DjVuImage.php111
-rw-r--r--includes/media/scripts/retrieveDjvuMetaData.sh29
5 files changed, 117 insertions, 75 deletions
diff --git a/includes/MainConfigNames.php b/includes/MainConfigNames.php
index a64b754ca36c..2f67b39ad4d3 100644
--- a/includes/MainConfigNames.php
+++ b/includes/MainConfigNames.php
@@ -873,12 +873,6 @@ class MainConfigNames {
public const DjvuUseBoxedCommand = 'DjvuUseBoxedCommand';
/**
- * Name constant for the DjvuShell setting, for use with Config::get()
- * @see MainConfigSchema::DjvuShell
- */
- public const DjvuShell = 'DjvuShell';
-
- /**
* Name constant for the DjvuDump setting, for use with Config::get()
* @see MainConfigSchema::DjvuDump
*/
@@ -4323,6 +4317,12 @@ class MainConfigNames {
public const ShellboxSecretKey = 'ShellboxSecretKey';
/**
+ * Name constant for the ShellboxShell setting, for use with Config::get()
+ * @see MainConfigSchema::ShellboxShell
+ */
+ public const ShellboxShell = 'ShellboxShell';
+
+ /**
* Name constant for the HTTPTimeout setting, for use with Config::get()
* @see MainConfigSchema::HTTPTimeout
*/
diff --git a/includes/MainConfigSchema.php b/includes/MainConfigSchema.php
index c08fee9e9bfb..51e51602e85b 100644
--- a/includes/MainConfigSchema.php
+++ b/includes/MainConfigSchema.php
@@ -2478,21 +2478,20 @@ class MainConfigSchema {
/**
* Whether to use BoxedCommand or not. Temporary feature flag for T352515
+ *
+ * @since 1.42
*/
public const DjvuUseBoxedCommand = [
'default' => false,
];
- /**
- * When using BoxedCommand, use this shell to extract the djvu metadata
- */
- public const DjvuShell = [
- 'default' => '/bin/sh',
- 'type' => '?string',
- ];
+
/**
* Path of the djvudump executable
* Enable this and $wgDjvuRenderer to enable djvu rendering
* example: $wgDjvuDump = 'djvudump';
+ *
+ * If this is set, {@link self::ShellboxShell} must be set to the correct
+ * shell path.
*/
public const DjvuDump = [
'default' => null,
@@ -2513,6 +2512,9 @@ class MainConfigSchema {
* Path of the djvutxt DJVU text extraction utility
* Enable this and $wgDjvuDump to enable text layer extraction from djvu files
* example: $wgDjvuTxt = 'djvutxt';
+ *
+ * If this is set, {@link self::ShellboxShell} must be set to the correct
+ * shell path.
*/
public const DjvuTxt = [
'default' => null,
@@ -12397,6 +12399,20 @@ class MainConfigSchema {
'type' => '?string',
];
+ /**
+ * The POSIX-compatible shell to use when running scripts. This is used by
+ * some media handling shell commands.
+ *
+ * If ShellboxUrls is configured, this path should exist on the remote side.
+ * On Windows this should be the full path to bash.exe, not git-bash.exe.
+ *
+ * @since 1.42
+ */
+ public const ShellboxShell = [
+ 'default' => '/bin/sh',
+ 'type' => '?string',
+ ];
+
// endregion -- end Shell and process control
/***************************************************************************/
diff --git a/includes/config-schema.php b/includes/config-schema.php
index 80debe91fa6a..f905e3271d19 100644
--- a/includes/config-schema.php
+++ b/includes/config-schema.php
@@ -318,7 +318,6 @@ return [
'ResponsiveImages' => true,
'ImagePreconnect' => false,
'DjvuUseBoxedCommand' => false,
- 'DjvuShell' => '/bin/sh',
'DjvuDump' => null,
'DjvuRenderer' => null,
'DjvuTxt' => null,
@@ -2457,6 +2456,7 @@ return [
'default' => null,
],
'ShellboxSecretKey' => null,
+ 'ShellboxShell' => '/bin/sh',
'HTTPTimeout' => 25,
'HTTPConnectTimeout' => 5.0,
'HTTPMaxTimeout' => 0,
@@ -2613,10 +2613,6 @@ return [
],
'UploadThumbnailRenderMap' => 'object',
'GalleryOptions' => 'object',
- 'DjvuShell' => [
- 0 => 'string',
- 1 => 'null',
- ],
'DjvuDump' => [
0 => 'string',
1 => 'null',
@@ -2961,6 +2957,10 @@ return [
0 => 'string',
1 => 'null',
],
+ 'ShellboxShell' => [
+ 0 => 'string',
+ 1 => 'null',
+ ],
'HTTPTimeout' => 'number',
'HTTPConnectTimeout' => 'number',
'HTTPMaxTimeout' => 'number',
diff --git a/includes/media/DjVuImage.php b/includes/media/DjVuImage.php
index 14fac90bc236..907fa214d03b 100644
--- a/includes/media/DjVuImage.php
+++ b/includes/media/DjVuImage.php
@@ -254,96 +254,113 @@ class DjVuImage {
$djvuDump = $config->get( MainConfigNames::DjvuDump );
$djvuTxt = $config->get( MainConfigNames::DjvuTxt );
$djvuUseBoxedCommand = $config->get( MainConfigNames::DjvuUseBoxedCommand );
- $djvuShell = $config->get( MainConfigNames::DjvuShell );
+ $shell = $config->get( MainConfigNames::ShellboxShell );
if ( !$this->isValid() ) {
return false;
}
- $retval = null;
+ if ( $djvuTxt === null && $djvuDump === null ) {
+ return [];
+ }
+
+ $txt = null;
+ $dump = null;
if ( $djvuUseBoxedCommand ) {
$command = MediaWikiServices::getInstance()->getShellCommandFactory()
- ->createBoxed( 'media' )
+ ->createBoxed( 'djvu' )
->disableNetwork()
->firejailDefaultSeccomp()
- ->routeName( 'djvu-metadata' );
- $command
- ->params( $djvuShell, 'scripts/retrieveDjvuMetaData.sh' )
+ ->routeName( 'djvu-metadata' )
+ ->params( $shell, 'scripts/retrieveDjvuMetaData.sh' )
->inputFileFromFile(
'scripts/retrieveDjvuMetaData.sh',
__DIR__ . '/scripts/retrieveDjvuMetaData.sh' )
->inputFileFromFile( 'file.djvu', $this->mFilename )
->memoryLimit( self::DJVUTXT_MEMORY_LIMIT );
$env = [];
- if ( isset( $djvuDump ) ) {
+ if ( $djvuDump !== null ) {
$env['DJVU_DUMP'] = $djvuDump;
$command->outputFileToString( 'dump' );
- $result = $command->environment( $env )->execute();
}
- if ( isset( $djvuTxt ) ) {
+ if ( $djvuTxt !== null ) {
$env['DJVU_TXT'] = $djvuTxt;
$command->outputFileToString( 'txt' );
- $command->outputFileToString( 'txt_exit_code' );
}
$result = $command
->environment( $env )
->execute();
- if ( isset( $djvuDump ) ) {
- $dump = $result->getFileContents( 'dump' );
- $json = $this->convertDumpToJSON( $dump );
- } else {
- $json = null;
+ if ( $result->getExitCode() !== 0 ) {
+ wfDebug( 'retrieveDjvuMetaData failed with exit code ' . $result->getExitCode() );
+ return false;
}
- if ( isset( $djvuTxt ) ) {
- $retval = (int)trim( $result->getFileContents( 'txt_exit_code' ) );
- $txt = $result->getFileContents( 'txt' );
+ if ( $djvuDump !== null ) {
+ if ( $result->wasReceived( 'dump' ) ) {
+ $dump = $result->getFileContents( 'dump' );
+ } else {
+ wfDebug( __METHOD__ . ": did not receive dump file" );
+ }
+ }
+
+ if ( $djvuTxt !== null ) {
+ if ( $result->wasReceived( 'txt' ) ) {
+ $txt = $result->getFileContents( 'txt' );
+ } else {
+ wfDebug( __METHOD__ . ": did not receive text file" );
+ }
}
} else { // No boxedcommand
- if ( isset( $djvuDump ) ) {
+ if ( $djvuDump !== null ) {
# djvudump is faster than djvutoxml (now abandoned) as of version 3.5
# https://sourceforge.net/p/djvu/bugs/71/
$cmd = Shell::escape( $djvuDump ) . ' ' . Shell::escape( $this->mFilename );
$dump = wfShellExec( $cmd );
- $json = [ 'data' => $this->convertDumpToJSON( $dump ) ];
- } else {
- $json = null;
}
- if ( isset( $djvuTxt ) ) {
+ if ( $djvuTxt !== null ) {
$cmd = Shell::escape( $djvuTxt ) . ' --detail=page ' . Shell::escape( $this->mFilename );
wfDebug( __METHOD__ . ": $cmd" );
$retval = 0;
$txt = wfShellExec( $cmd, $retval, [], [ 'memory' => self::DJVUTXT_MEMORY_LIMIT ] );
+ if ( $retval !== 0 ) {
+ $txt = null;
+ }
}
}
- $txt ??= '';
+ # Convert dump to array
+ $json = [];
+ if ( $dump !== null ) {
+ $data = $this->convertDumpToJSON( $dump );
+ if ( $data !== false ) {
+ $json = [ 'data' => $data ];
+ }
+ }
# Text layer
- if ( isset( $djvuTxt ) ) {
- $json['text'] = [];
- if ( $retval === 0 ) {
- # Strip some control characters
- # Ignore carriage returns
- $txt = preg_replace( "/\\\\013/", "", $txt );
- # Replace runs of OCR region separators with a single extra line break
- $txt = preg_replace( "/(?:\\\\(035|037))+/", "\n", $txt );
-
- $reg = <<<EOR
- /\(page\s[\d-]*\s[\d-]*\s[\d-]*\s[\d-]*\s*"
- ((?> # Text to match is composed of atoms of either:
- \\\\. # - any escaped character
- | # - any character different from " and \
- [^"\\\\]+
- )*?)
- "\s*\)
- | # Or page can be empty ; in this case, djvutxt dumps ()
- \(\s*()\)/sx
+ if ( $txt !== null ) {
+ # Strip some control characters
+ # Ignore carriage returns
+ $txt = preg_replace( "/\\\\013/", "", $txt );
+ # Replace runs of OCR region separators with a single extra line break
+ $txt = preg_replace( "/(?:\\\\(035|037))+/", "\n", $txt );
+
+ $reg = <<<EOR
+ /\(page\s[\d-]*\s[\d-]*\s[\d-]*\s[\d-]*\s*"
+ ((?> # Text to match is composed of atoms of either:
+ \\\\. # - any escaped character
+ | # - any character different from " and \
+ [^"\\\\]+
+ )*?)
+ "\s*\)
+ | # Or page can be empty ; in this case, djvutxt dumps ()
+ \(\s*()\)/sx
EOR;
- $matches = [];
- preg_match_all( $reg, $txt, $matches );
- $json['text'] = array_map( [ $this, 'pageTextCallback' ], $matches[1] );
- }
+ $matches = [];
+ preg_match_all( $reg, $txt, $matches );
+ $json['text'] = array_map( [ $this, 'pageTextCallback' ], $matches[1] );
+ } else {
+ $json['text'] = [];
}
return $json;
diff --git a/includes/media/scripts/retrieveDjvuMetaData.sh b/includes/media/scripts/retrieveDjvuMetaData.sh
index 601b20a97d4c..5ca0fb6a33aa 100644
--- a/includes/media/scripts/retrieveDjvuMetaData.sh
+++ b/includes/media/scripts/retrieveDjvuMetaData.sh
@@ -1,23 +1,32 @@
#!/bin/sh
-# Get parameters from environment
-export DJVU_DUMP="${DJVU_DUMP:-djvudump}"
-export DJVU_TXT="${DJVU_TXT:-djvutxt}"
+
runDump() {
+ local ret
# djvudump is faster than djvutoxml (now abandoned) as of version 3.5
# https://sourceforge.net/p/djvu/bugs/71/
"$DJVU_DUMP" file.djvu > dump
+ ret="$?"
+ if [ "$ret" -ne 0 ]; then
+ echo "djvudump failed with exit code $ret" 1>&2
+ rm -f dump
+ fi
}
runTxt() {
+ local ret
# Text layer
- "$DJVU_TXT" \
- --detail=page \
- file.djvu > txt
- # Store exit code so we can use it later
- echo $? > txt_exit_code
+ if ! "$DJVU_TXT" --detail=page file.djvu > txt; then
+ rm -f txt
+ fi
+ ret="$?"
+ if [ "$ret" -ne 0 ]; then
+ echo "djvutxt failed with exit code $ret" 1>&2
+ rm -f txt
+ fi
}
-if [ -x "$DJVU_DUMP" ]; then
+if [ -n "$DJVU_DUMP" ]; then
runDump
fi
-if [ -x "$DJVU_TXT" ]; then
+if [ -n "$DJVU_TXT" ]; then
runTxt
fi
+exit 0