diff options
author | Tim Starling <tstarling@wikimedia.org> | 2023-12-04 11:09:12 +1100 |
---|---|---|
committer | Tim Starling <tstarling@wikimedia.org> | 2023-12-04 20:20:32 +1100 |
commit | 497efa4ed6b8630e414c4c917ed95f66ee6235fd (patch) | |
tree | a7655661bf60fca376e57ef908b05345b69a9f8f /includes | |
parent | a007f6e09a9ed65a77e51dfe0435f85e9841d23c (diff) | |
download | mediawikicore-497efa4ed6b8630e414c4c917ed95f66ee6235fd.tar.gz mediawikicore-497efa4ed6b8630e414c4c917ed95f66ee6235fd.zip |
Clean up DjVuImage::retrieveMetaData including shellbox support
Following up Id9539a28f0f143539334002c3:
* Don't run the script twice.
* Wrap the decoded dump in an array with key "data".
* The default assignment for DJVU_DUMP and DJVU_TXT had the effect of
running the scripts anyway if the configuration variables are null.
Don't do that.
* If both $wgDjvuDump and $wgDjvuTxt are null, don't run the shellbox
script.
* Centralise shell location configuration.
* Factor out call to convertDumpToJSON().
* Instead of txt_exit_code, just use existence of the file to
communicate success. This avoids a deprecation warning if
txt_exit_code was not received, due to passing null to trim.
* Check for the existence of the result files instead of just trying to
use them.
* Check the exit status of the overall script.
* Confirm that the BoxedCommand branch is functional and works in CLI
mode by using it in DjvuTest.
* Change the service name from "media" to "djvu". Existing examples are
"pagedtiffhandler" and "pdfhandler", i.e. named after the extension,
there is no other core caller. I think it should be more fine-grained
than "media". The name was possibly a conflation with the
ProductionServices array key.
Also:
* Check the exit status of djvudump and don't use the output file if it
is non-zero.
* Check the return value of convertDumpToJSON().
* Don't use isset() unless error suppression is intended.
Bug: T352515
Change-Id: If41a2baada2e4e2462518c1f437af458feb29632
Diffstat (limited to 'includes')
-rw-r--r-- | includes/MainConfigNames.php | 12 | ||||
-rw-r--r-- | includes/MainConfigSchema.php | 30 | ||||
-rw-r--r-- | includes/config-schema.php | 10 | ||||
-rw-r--r-- | includes/media/DjVuImage.php | 111 | ||||
-rw-r--r-- | includes/media/scripts/retrieveDjvuMetaData.sh | 29 |
5 files changed, 117 insertions, 75 deletions
diff --git a/includes/MainConfigNames.php b/includes/MainConfigNames.php index a64b754ca36c..2f67b39ad4d3 100644 --- a/includes/MainConfigNames.php +++ b/includes/MainConfigNames.php @@ -873,12 +873,6 @@ class MainConfigNames { public const DjvuUseBoxedCommand = 'DjvuUseBoxedCommand'; /** - * Name constant for the DjvuShell setting, for use with Config::get() - * @see MainConfigSchema::DjvuShell - */ - public const DjvuShell = 'DjvuShell'; - - /** * Name constant for the DjvuDump setting, for use with Config::get() * @see MainConfigSchema::DjvuDump */ @@ -4323,6 +4317,12 @@ class MainConfigNames { public const ShellboxSecretKey = 'ShellboxSecretKey'; /** + * Name constant for the ShellboxShell setting, for use with Config::get() + * @see MainConfigSchema::ShellboxShell + */ + public const ShellboxShell = 'ShellboxShell'; + + /** * Name constant for the HTTPTimeout setting, for use with Config::get() * @see MainConfigSchema::HTTPTimeout */ diff --git a/includes/MainConfigSchema.php b/includes/MainConfigSchema.php index c08fee9e9bfb..51e51602e85b 100644 --- a/includes/MainConfigSchema.php +++ b/includes/MainConfigSchema.php @@ -2478,21 +2478,20 @@ class MainConfigSchema { /** * Whether to use BoxedCommand or not. Temporary feature flag for T352515 + * + * @since 1.42 */ public const DjvuUseBoxedCommand = [ 'default' => false, ]; - /** - * When using BoxedCommand, use this shell to extract the djvu metadata - */ - public const DjvuShell = [ - 'default' => '/bin/sh', - 'type' => '?string', - ]; + /** * Path of the djvudump executable * Enable this and $wgDjvuRenderer to enable djvu rendering * example: $wgDjvuDump = 'djvudump'; + * + * If this is set, {@link self::ShellboxShell} must be set to the correct + * shell path. */ public const DjvuDump = [ 'default' => null, @@ -2513,6 +2512,9 @@ class MainConfigSchema { * Path of the djvutxt DJVU text extraction utility * Enable this and $wgDjvuDump to enable text layer extraction from djvu files * example: $wgDjvuTxt = 'djvutxt'; + * + * If this is set, {@link self::ShellboxShell} must be set to the correct + * shell path. */ public const DjvuTxt = [ 'default' => null, @@ -12397,6 +12399,20 @@ class MainConfigSchema { 'type' => '?string', ]; + /** + * The POSIX-compatible shell to use when running scripts. This is used by + * some media handling shell commands. + * + * If ShellboxUrls is configured, this path should exist on the remote side. + * On Windows this should be the full path to bash.exe, not git-bash.exe. + * + * @since 1.42 + */ + public const ShellboxShell = [ + 'default' => '/bin/sh', + 'type' => '?string', + ]; + // endregion -- end Shell and process control /***************************************************************************/ diff --git a/includes/config-schema.php b/includes/config-schema.php index 80debe91fa6a..f905e3271d19 100644 --- a/includes/config-schema.php +++ b/includes/config-schema.php @@ -318,7 +318,6 @@ return [ 'ResponsiveImages' => true, 'ImagePreconnect' => false, 'DjvuUseBoxedCommand' => false, - 'DjvuShell' => '/bin/sh', 'DjvuDump' => null, 'DjvuRenderer' => null, 'DjvuTxt' => null, @@ -2457,6 +2456,7 @@ return [ 'default' => null, ], 'ShellboxSecretKey' => null, + 'ShellboxShell' => '/bin/sh', 'HTTPTimeout' => 25, 'HTTPConnectTimeout' => 5.0, 'HTTPMaxTimeout' => 0, @@ -2613,10 +2613,6 @@ return [ ], 'UploadThumbnailRenderMap' => 'object', 'GalleryOptions' => 'object', - 'DjvuShell' => [ - 0 => 'string', - 1 => 'null', - ], 'DjvuDump' => [ 0 => 'string', 1 => 'null', @@ -2961,6 +2957,10 @@ return [ 0 => 'string', 1 => 'null', ], + 'ShellboxShell' => [ + 0 => 'string', + 1 => 'null', + ], 'HTTPTimeout' => 'number', 'HTTPConnectTimeout' => 'number', 'HTTPMaxTimeout' => 'number', diff --git a/includes/media/DjVuImage.php b/includes/media/DjVuImage.php index 14fac90bc236..907fa214d03b 100644 --- a/includes/media/DjVuImage.php +++ b/includes/media/DjVuImage.php @@ -254,96 +254,113 @@ class DjVuImage { $djvuDump = $config->get( MainConfigNames::DjvuDump ); $djvuTxt = $config->get( MainConfigNames::DjvuTxt ); $djvuUseBoxedCommand = $config->get( MainConfigNames::DjvuUseBoxedCommand ); - $djvuShell = $config->get( MainConfigNames::DjvuShell ); + $shell = $config->get( MainConfigNames::ShellboxShell ); if ( !$this->isValid() ) { return false; } - $retval = null; + if ( $djvuTxt === null && $djvuDump === null ) { + return []; + } + + $txt = null; + $dump = null; if ( $djvuUseBoxedCommand ) { $command = MediaWikiServices::getInstance()->getShellCommandFactory() - ->createBoxed( 'media' ) + ->createBoxed( 'djvu' ) ->disableNetwork() ->firejailDefaultSeccomp() - ->routeName( 'djvu-metadata' ); - $command - ->params( $djvuShell, 'scripts/retrieveDjvuMetaData.sh' ) + ->routeName( 'djvu-metadata' ) + ->params( $shell, 'scripts/retrieveDjvuMetaData.sh' ) ->inputFileFromFile( 'scripts/retrieveDjvuMetaData.sh', __DIR__ . '/scripts/retrieveDjvuMetaData.sh' ) ->inputFileFromFile( 'file.djvu', $this->mFilename ) ->memoryLimit( self::DJVUTXT_MEMORY_LIMIT ); $env = []; - if ( isset( $djvuDump ) ) { + if ( $djvuDump !== null ) { $env['DJVU_DUMP'] = $djvuDump; $command->outputFileToString( 'dump' ); - $result = $command->environment( $env )->execute(); } - if ( isset( $djvuTxt ) ) { + if ( $djvuTxt !== null ) { $env['DJVU_TXT'] = $djvuTxt; $command->outputFileToString( 'txt' ); - $command->outputFileToString( 'txt_exit_code' ); } $result = $command ->environment( $env ) ->execute(); - if ( isset( $djvuDump ) ) { - $dump = $result->getFileContents( 'dump' ); - $json = $this->convertDumpToJSON( $dump ); - } else { - $json = null; + if ( $result->getExitCode() !== 0 ) { + wfDebug( 'retrieveDjvuMetaData failed with exit code ' . $result->getExitCode() ); + return false; } - if ( isset( $djvuTxt ) ) { - $retval = (int)trim( $result->getFileContents( 'txt_exit_code' ) ); - $txt = $result->getFileContents( 'txt' ); + if ( $djvuDump !== null ) { + if ( $result->wasReceived( 'dump' ) ) { + $dump = $result->getFileContents( 'dump' ); + } else { + wfDebug( __METHOD__ . ": did not receive dump file" ); + } + } + + if ( $djvuTxt !== null ) { + if ( $result->wasReceived( 'txt' ) ) { + $txt = $result->getFileContents( 'txt' ); + } else { + wfDebug( __METHOD__ . ": did not receive text file" ); + } } } else { // No boxedcommand - if ( isset( $djvuDump ) ) { + if ( $djvuDump !== null ) { # djvudump is faster than djvutoxml (now abandoned) as of version 3.5 # https://sourceforge.net/p/djvu/bugs/71/ $cmd = Shell::escape( $djvuDump ) . ' ' . Shell::escape( $this->mFilename ); $dump = wfShellExec( $cmd ); - $json = [ 'data' => $this->convertDumpToJSON( $dump ) ]; - } else { - $json = null; } - if ( isset( $djvuTxt ) ) { + if ( $djvuTxt !== null ) { $cmd = Shell::escape( $djvuTxt ) . ' --detail=page ' . Shell::escape( $this->mFilename ); wfDebug( __METHOD__ . ": $cmd" ); $retval = 0; $txt = wfShellExec( $cmd, $retval, [], [ 'memory' => self::DJVUTXT_MEMORY_LIMIT ] ); + if ( $retval !== 0 ) { + $txt = null; + } } } - $txt ??= ''; + # Convert dump to array + $json = []; + if ( $dump !== null ) { + $data = $this->convertDumpToJSON( $dump ); + if ( $data !== false ) { + $json = [ 'data' => $data ]; + } + } # Text layer - if ( isset( $djvuTxt ) ) { - $json['text'] = []; - if ( $retval === 0 ) { - # Strip some control characters - # Ignore carriage returns - $txt = preg_replace( "/\\\\013/", "", $txt ); - # Replace runs of OCR region separators with a single extra line break - $txt = preg_replace( "/(?:\\\\(035|037))+/", "\n", $txt ); - - $reg = <<<EOR - /\(page\s[\d-]*\s[\d-]*\s[\d-]*\s[\d-]*\s*" - ((?> # Text to match is composed of atoms of either: - \\\\. # - any escaped character - | # - any character different from " and \ - [^"\\\\]+ - )*?) - "\s*\) - | # Or page can be empty ; in this case, djvutxt dumps () - \(\s*()\)/sx + if ( $txt !== null ) { + # Strip some control characters + # Ignore carriage returns + $txt = preg_replace( "/\\\\013/", "", $txt ); + # Replace runs of OCR region separators with a single extra line break + $txt = preg_replace( "/(?:\\\\(035|037))+/", "\n", $txt ); + + $reg = <<<EOR + /\(page\s[\d-]*\s[\d-]*\s[\d-]*\s[\d-]*\s*" + ((?> # Text to match is composed of atoms of either: + \\\\. # - any escaped character + | # - any character different from " and \ + [^"\\\\]+ + )*?) + "\s*\) + | # Or page can be empty ; in this case, djvutxt dumps () + \(\s*()\)/sx EOR; - $matches = []; - preg_match_all( $reg, $txt, $matches ); - $json['text'] = array_map( [ $this, 'pageTextCallback' ], $matches[1] ); - } + $matches = []; + preg_match_all( $reg, $txt, $matches ); + $json['text'] = array_map( [ $this, 'pageTextCallback' ], $matches[1] ); + } else { + $json['text'] = []; } return $json; diff --git a/includes/media/scripts/retrieveDjvuMetaData.sh b/includes/media/scripts/retrieveDjvuMetaData.sh index 601b20a97d4c..5ca0fb6a33aa 100644 --- a/includes/media/scripts/retrieveDjvuMetaData.sh +++ b/includes/media/scripts/retrieveDjvuMetaData.sh @@ -1,23 +1,32 @@ #!/bin/sh -# Get parameters from environment -export DJVU_DUMP="${DJVU_DUMP:-djvudump}" -export DJVU_TXT="${DJVU_TXT:-djvutxt}" + runDump() { + local ret # djvudump is faster than djvutoxml (now abandoned) as of version 3.5 # https://sourceforge.net/p/djvu/bugs/71/ "$DJVU_DUMP" file.djvu > dump + ret="$?" + if [ "$ret" -ne 0 ]; then + echo "djvudump failed with exit code $ret" 1>&2 + rm -f dump + fi } runTxt() { + local ret # Text layer - "$DJVU_TXT" \ - --detail=page \ - file.djvu > txt - # Store exit code so we can use it later - echo $? > txt_exit_code + if ! "$DJVU_TXT" --detail=page file.djvu > txt; then + rm -f txt + fi + ret="$?" + if [ "$ret" -ne 0 ]; then + echo "djvutxt failed with exit code $ret" 1>&2 + rm -f txt + fi } -if [ -x "$DJVU_DUMP" ]; then +if [ -n "$DJVU_DUMP" ]; then runDump fi -if [ -x "$DJVU_TXT" ]; then +if [ -n "$DJVU_TXT" ]; then runTxt fi +exit 0 |