Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Mask control characters in filenames #118

Closed
wants to merge 7 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -1536,6 +1536,8 @@ if(HAVE_DECODERS AND (NOT MSVC OR MSVC_VERSION GREATER_EQUAL 1900))
src/common/sysdefs.h
src/common/tuklib_common.h
src/common/tuklib_config.h
src/common/tuklib_mbstr_nonprint.c
src/common/tuklib_mbstr_nonprint.h
src/common/tuklib_exit.c
src/common/tuklib_exit.h
src/common/tuklib_gettext.h
Expand Down Expand Up @@ -1565,6 +1567,7 @@ if(HAVE_DECODERS AND (NOT MSVC OR MSVC_VERSION GREATER_EQUAL 1900))
endif()

tuklib_progname("${XZDEC}")
tuklib_mbstr("${XZDEC}")

install(TARGETS "${XZDEC}"
RUNTIME DESTINATION "${CMAKE_INSTALL_BINDIR}"
Expand Down Expand Up @@ -1592,6 +1595,8 @@ if(HAVE_DECODERS AND (NOT MSVC OR MSVC_VERSION GREATER_EQUAL 1900))
src/common/sysdefs.h
src/common/tuklib_common.h
src/common/tuklib_config.h
src/common/tuklib_mbstr_nonprint.c
src/common/tuklib_mbstr_nonprint.h
src/common/tuklib_exit.c
src/common/tuklib_exit.h
src/common/tuklib_gettext.h
Expand All @@ -1616,6 +1621,7 @@ if(HAVE_DECODERS AND (NOT MSVC OR MSVC_VERSION GREATER_EQUAL 1900))
endif()

tuklib_progname(lzmainfo)
tuklib_mbstr(lzmainfo)

# NOTE: The translations are in the "xz" domain and the .mo files are
# installed as part of the "xz" target.
Expand Down Expand Up @@ -1649,6 +1655,8 @@ if(NOT MSVC OR MSVC_VERSION GREATER_EQUAL 1900)
src/common/sysdefs.h
src/common/tuklib_common.h
src/common/tuklib_config.h
src/common/tuklib_mbstr_nonprint.c
src/common/tuklib_mbstr_nonprint.h
src/common/tuklib_exit.c
src/common/tuklib_exit.h
src/common/tuklib_gettext.h
Expand Down
2 changes: 2 additions & 0 deletions THANKS
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ has been important. :-) In alphabetical order:
- Tomer Chachamu
- Vitaly Chikunov
- Antoine Cœur
- Ryan Colyer
- Gabi Davar
- İhsan Doğan
- Chris Donawa
Expand Down Expand Up @@ -132,6 +133,7 @@ has been important. :-) In alphabetical order:
- Bernhard Reutner-Fischer
- Markus Rickert
- Cristian Rodríguez
- Jeroen Roovers
- Christian von Roques
- Boud Roukema
- Torsten Rupp
Expand Down
3 changes: 0 additions & 3 deletions TODO
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,6 @@ Known bugs
compress extremely well, so going from compression ratio of 0.003
to 0.004 means big relative increase in the compressed file size.

xz doesn't quote unprintable characters when it displays file names
given on the command line.

tuklib_exit() doesn't block signals => EINTR is possible.

If liblzma has created threads and fork() gets called, liblzma
Expand Down
2 changes: 2 additions & 0 deletions src/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ EXTRA_DIST = \
common/tuklib_integer.h \
common/tuklib_mbstr_fw.c \
common/tuklib_mbstr.h \
common/tuklib_mbstr_nonprint.c \
common/tuklib_mbstr_nonprint.h \
common/tuklib_mbstr_width.c \
common/tuklib_open_stdxxx.c \
common/tuklib_open_stdxxx.h \
Expand Down
139 changes: 139 additions & 0 deletions src/common/tuklib_mbstr_nonprint.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
// SPDX-License-Identifier: 0BSD

///////////////////////////////////////////////////////////////////////////////
//
/// \file tuklib_mbstr_nonprint.c
/// \brief Find and replace non-printable characters with question marks
//
// Author: Lasse Collin
//
///////////////////////////////////////////////////////////////////////////////

#include "tuklib_mbstr_nonprint.h"
#include <errno.h>
#include <stdlib.h>
#include <string.h>

#ifdef HAVE_MBRTOWC
# include <wchar.h>
# include <wctype.h>
#else
# include <ctype.h>
#endif


// Examine the single character that begins at str, where len is the number
// of bytes remaining in the string (not counting the terminating '\0').
//
// The number of bytes to skip in order to reach the following character is
// stored to *next_len. The return value is true if the character is
// printable and false if it is non-printable or not a valid character.
static bool
is_next_printable(const char *str, size_t len, size_t *next_len)
{
#ifdef HAVE_MBRTOWC
	// Every call starts decoding from the initial conversion state.
	// This is a small cheat: it assumes that character sets with
	// locking shifts aren't in use. No evidence was found of such
	// charsets being used on POSIX-like systems, even ancient ones.
	mbstate_t state;
	memset(&state, 0, sizeof(state));

	wchar_t wc;
	const size_t ret = mbrtowc(&wc, str, len, &state);

	if (ret == (size_t)-2) {
		// The remaining bytes form an incomplete multibyte
		// sequence. Consume all of them as one non-printable
		// character that terminates the string.
		*next_len = len;
		return false;
	}

	// This is deliberately broader than ret == (size_t)-1 so that
	// any unexpected return value from mbrtowc() is caught too.
	// (size_t)-1 is SIZE_MAX and thus it is always greater than len
	// because len comes from strlen() and doesn't include the
	// terminating '\0'.
	if (ret < 1 || ret > len) {
		// Invalid multibyte sequence. Consume only the first byte
		// as a non-printable single-byte character; the next call
		// will try decoding again starting from the byte after it.
		*next_len = 1;
		return false;
	}

	*next_len = ret;
	return iswprint((wint_t)wc) != 0;
#else
	// Without mbrtowc() every byte is treated as one character.
	(void)len;
	*next_len = 1;

	const unsigned char ch = (unsigned char)str[0];
	return isprint(ch) != 0;
#endif
}


// Walk through the first len bytes of str one character at a time and
// return true as soon as a non-printable (or invalid) character is found.
// Returns false if every character is printable, including when len is 0.
static bool
has_nonprint(const char *str, size_t len)
{
	size_t pos = 0;

	while (pos < len) {
		size_t char_len;
		const bool printable = is_next_printable(
				str + pos, len - pos, &char_len);
		if (!printable)
			return true;

		// Step over the character that was just examined.
		pos += char_len;
	}

	return false;
}


// Check if the '\0'-terminated string str contains any non-printable
// characters. See tuklib_mbstr_nonprint.h for the full description.
extern bool
tuklib_has_nonprint(const char *str)
{
	// The scan may modify errno: for example, mbrtowc() sets it to
	// EILSEQ when it meets an invalid multibyte sequence. This function
	// is a pure predicate, so keep the caller's errno intact. That way
	// it is safe to call between a failed system call and the code that
	// reports the error.
	const int saved_errno = errno;
	const bool ret = has_nonprint(str, strlen(str));
	errno = saved_errno;

	return ret;
}


// Return a version of str in which every non-printable or invalid character
// has been replaced with '?'.
//
// str is the '\0'-terminated string to mask. *mem is always passed to
// free() first; it must be NULL or a pointer returned via an earlier call.
// On return *mem points to the newly allocated masked string, or it is
// NULL if no allocation was needed or the allocation failed.
//
// The return value is str itself if it has nothing to mask, *mem if
// a masked copy was created, or the string literal "???" if memory
// allocation failed.
//
// errno is preserved. This function is typically called in the argument
// list of a printf-like function next to strerror(errno). The evaluation
// order of function arguments is unspecified, and free(), malloc(), and
// mbrtowc() are allowed to modify errno, so without this the wrong error
// message could get printed.
extern const char *
tuklib_mask_nonprint_r(const char *str, char **mem)
{
	const int saved_errno = errno;

	// Free the old string, if any.
	free(*mem);
	*mem = NULL;

	// If the whole input string contains only printable characters,
	// return the input string.
	const size_t len = strlen(str);
	if (!has_nonprint(str, len)) {
		errno = saved_errno;
		return str;
	}

	// Allocate memory for the masked string. Since we use the single-byte
	// character '?' to mask non-printable characters, it's possible that
	// a few bytes less memory would be needed in reality if multibyte
	// characters are masked.
	//
	// If allocation fails, return "???" because it should be safer than
	// returning the unmasked string.
	*mem = malloc(len + 1);
	if (*mem == NULL) {
		errno = saved_errno;
		return "???";
	}

	// Replace all non-printable characters with '?'.
	char *dest = *mem;

	for (size_t i = 0; i < len; ) {
		size_t next_len;
		if (is_next_printable(str + i, len - i, &next_len)) {
			// Printable: copy all bytes of the character as is.
			memcpy(dest, str + i, next_len);
			dest += next_len;
		} else {
			// Non-printable or invalid: next_len input bytes
			// collapse into a single '?'.
			*dest++ = '?';
		}

		i += next_len;
	}

	*dest = '\0';

	errno = saved_errno;
	return *mem;
}


// Convenience wrapper around tuklib_mask_nonprint_r() for single-threaded
// use. The possible allocation is kept in a static variable, so each call
// frees the masked string created by the previous call.
extern const char *
tuklib_mask_nonprint(const char *str)
{
	static char *masked = NULL;

	const char *result = tuklib_mask_nonprint_r(str, &masked);
	return result;
}
76 changes: 76 additions & 0 deletions src/common/tuklib_mbstr_nonprint.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
// SPDX-License-Identifier: 0BSD

///////////////////////////////////////////////////////////////////////////////
//
/// \file tuklib_mbstr_nonprint.h
/// \brief Find and replace non-printable characters with question marks
///
/// If mbrtowc(3) is available, it and iswprint(3) are used to check if all
/// characters are printable. Otherwise a single-byte character set is assumed
/// and isprint(3) is used.
///
/// It is assumed that no locale uses a character set with locking shift
/// states. This should be a safe assumption even on ancient legacy systems,
/// at least if they are remotely POSIX-like.
/// That is, multibyte character sets like UTF-8, EUC-JP, Shift-JIS, Big5,
/// and GB18030 are compatible with this implementation.
//
// Author: Lasse Collin
//
///////////////////////////////////////////////////////////////////////////////

#ifndef TUKLIB_MBSTR_NONPRINT_H
#define TUKLIB_MBSTR_NONPRINT_H

#include "tuklib_common.h"
TUKLIB_DECLS_BEGIN

#define tuklib_has_nonprint TUKLIB_SYMBOL(tuklib_has_nonprint)
extern bool tuklib_has_nonprint(const char *str);
///<
/// \brief Check if a string contains any non-printable characters
///
/// \param str Untrusted '\0'-terminated string, for example,
/// a filename. It must not be NULL because its length
/// is determined with strlen(3).
///
/// \return false if str contains only valid multibyte characters and
/// iswprint(3) returns non-zero for all of them; true otherwise.
/// An empty string contains nothing non-printable and thus
/// results in false.
///
/// \note In case mbrtowc(3) isn't available, single-byte character set
/// is assumed and isprint(3) is used instead of iswprint(3).


#define tuklib_mask_nonprint_r TUKLIB_SYMBOL(tuklib_mask_nonprint_r)
extern const char *tuklib_mask_nonprint_r(const char *str, char **mem);
///<
/// \brief Replace non-printable characters with question marks
///
/// \param str Untrusted string, for example, a filename
/// \param mem This function will always call free(*mem) to free
/// the old allocation. Before the first call, *mem should
/// be initialized to NULL. If this function needs to
/// allocate memory for a modified string, a pointer
/// to the allocated memory will be stored to *mem.
/// Otherwise *mem will remain NULL.
///
/// \return If tuklib_has_nonprint(str) returns false, this function
/// returns str. Otherwise memory is allocated to hold a modified
/// string and a pointer to that is returned. The pointer to the
/// allocated memory is also stored to *mem. A modified string
/// has the problematic characters replaced by '?'. If memory
/// allocation fails, "???" is returned and *mem is NULL.
///
/// \note The returned pointer may be str itself, *mem, or a string
/// literal, so never pass the return value to free(). To release
/// the allocation, call free(*mem) once the string is no longer
/// needed. A string returned by an earlier call with the same
/// mem becomes invalid on the next call if it pointed to the
/// allocated memory, because that memory is freed first.

#define tuklib_mask_nonprint TUKLIB_SYMBOL(tuklib_mask_nonprint)
extern const char *tuklib_mask_nonprint(const char *str);
///<
/// \brief Replace non-printable characters with question marks
///
/// This is a convenience function for single-threaded use. This calls
/// tuklib_mask_nonprint_r() using an internal static variable to hold
/// the possible allocation.
///
/// \param str Untrusted string, for example, a filename
///
/// \return See tuklib_mask_nonprint_r().
///
/// \note This function is not thread safe!
///
/// \note Because the allocation is shared between all calls, a masked
/// string returned by this function may be freed by the next call.
/// Use the result before calling this function again; for example,
/// don't pass the results of two calls as arguments to a single
/// printf(3) call. Use tuklib_mask_nonprint_r() with separate
/// mem pointers when more than one masked string is needed
/// at the same time.

TUKLIB_DECLS_END
#endif
1 change: 1 addition & 0 deletions src/lzmainfo/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ bin_PROGRAMS = lzmainfo
lzmainfo_SOURCES = \
lzmainfo.c \
../common/tuklib_progname.c \
../common/tuklib_mbstr_nonprint.c \
../common/tuklib_exit.c

if COND_W32
Expand Down
16 changes: 10 additions & 6 deletions src/lzmainfo/lzmainfo.c
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#include "getopt.h"
#include "tuklib_gettext.h"
#include "tuklib_progname.h"
#include "tuklib_mbstr_nonprint.h"
#include "tuklib_exit.h"

#ifdef TUKLIB_DOSLIKE
Expand Down Expand Up @@ -104,7 +105,8 @@ lzmainfo(const char *name, FILE *f)
uint8_t buf[13];
const size_t size = fread(buf, 1, sizeof(buf), f);
if (size != 13) {
fprintf(stderr, "%s: %s: %s\n", progname, name,
fprintf(stderr, "%s: %s: %s\n", progname,
tuklib_mask_nonprint(name),
ferror(f) ? strerror(errno)
: _("File is too small to be a .lzma file"));
return true;
Expand All @@ -118,7 +120,8 @@ lzmainfo(const char *name, FILE *f)
break;

case LZMA_OPTIONS_ERROR:
fprintf(stderr, "%s: %s: %s\n", progname, name,
fprintf(stderr, "%s: %s: %s\n", progname,
tuklib_mask_nonprint(name),
_("Not a .lzma file"));
return true;

Expand All @@ -142,7 +145,7 @@ lzmainfo(const char *name, FILE *f)
// this output and we don't want to break that when people move
// from LZMA Utils to XZ Utils.
if (f != stdin)
printf("%s\n", name);
printf("%s\n", tuklib_mask_nonprint(name));

printf("Uncompressed size: ");
if (uncompressed_size == UINT64_MAX)
Expand Down Expand Up @@ -201,9 +204,10 @@ main(int argc, char **argv)
if (f == NULL) {
ret = EXIT_FAILURE;
fprintf(stderr, "%s: %s: %s\n",
progname,
argv[optind],
strerror(errno));
progname,
tuklib_mask_nonprint(
argv[optind]),
strerror(errno));
continue;
}

Expand Down
1 change: 1 addition & 0 deletions src/xz/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ xz_SOURCES = \
../common/tuklib_open_stdxxx.c \
../common/tuklib_progname.c \
../common/tuklib_exit.c \
../common/tuklib_mbstr_nonprint.c \
../common/tuklib_mbstr_width.c \
../common/tuklib_mbstr_fw.c

Expand Down
Loading
Loading