Skip to content
This repository was archived by the owner on Aug 16, 2023. It is now read-only.

Commit 6b34650

Browse files
hhaenselstaticfloat
authored andcommitted
Extended workaround for handling symlinks in tar files (#163)
* Extended workaround for tars containing symlinks in environments without symlink support * Remove earlier windows workaround * slight formatting correction * corrected tar command for verbose listing * another correction of the tar command * Adapted regex to match tar listing of mac-os * obtain debug info for tuning the regex * More debug info * Adapt regex for Mac OS, next version * New version of regex to match more tars * Determine tarlist format from a demo file listing. Slight change to `gen_7z` in order to avoid the "world-age problem" * slight modification to avoid changes to gen_7z * define symlink_parser locally * Replace old copyderef by new version, use mktemp() for temp files, document regex, modify excludeoption to also accept files with spaces * added examples of verbose listings, put back `mkpath(dest)` (which was accidentilly removed) and moved it to the correct location above `probe_symlink_creation(dest)` (was wrong before!) * Added 7z Listing and corrected tar exclude command * brushing up the code and making excludlist::Union{AbstractString, Cmd} and defaulting it to `` * excludelist defaults to nothing; adaptations for tar command in Windows 10: option "-f -" for stdin/stdout, removed \r in parse_tar_list * Change type of excludelist argument to `Union{AbstractString, Nothing}` * Change type of excludelist argument to `Union{AbstractString, Nothing}` * Correct some errors... * place an error in case that `probe_platform_engines!()` has not been run
1 parent a306d9c commit 6b34650

File tree

3 files changed

+180
-72
lines changed

3 files changed

+180
-72
lines changed

src/PlatformEngines.jl

Lines changed: 150 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -19,15 +19,17 @@ gen_download_cmd = (url::AbstractString, out_path::AbstractString) ->
1919
error("Call `probe_platform_engines()` before `gen_download_cmd()`")
2020

2121
"""
22-
gen_unpack_cmd(tarball_path::AbstractString, out_path::AbstractString)
22+
gen_unpack_cmd(tarball_path::AbstractString, out_path::AbstractString; excludelist::Union{AbstractString, Nothing} = nothing)
2323
2424
Return a `Cmd` that will unpack the given `tarball_path` into the given
2525
`out_path`. If `out_path` is not already a directory, it will be created.
26+
`excludelist` is an optional file containing a list of files that are not unpacked.
27+
This option is mainly used to exclude symlinks from extraction (see: `copyderef`).
2628
2729
This method is initialized by `probe_platform_engines!()`, which should be
2830
automatically called upon first import of `BinaryProvider`.
2931
"""
30-
gen_unpack_cmd = (tarball_path::AbstractString, out_path::AbstractString) ->
32+
gen_unpack_cmd = (tarball_path::AbstractString, out_path::AbstractString; excludelist::Union{AbstractString, Nothing} = nothing) ->
3133
error("Call `probe_platform_engines()` before `gen_unpack_cmd()`")
3234

3335
"""
@@ -130,11 +132,6 @@ function probe_symlink_creation(dest::AbstractString)
130132
end
131133
end
132134

133-
# Global variable that tells us whether tempdir() can have symlinks
134-
# created within it.
135-
tempdir_symlink_creation = false
136-
137-
138135
"""
139136
probe_platform_engines!(;verbose::Bool = false)
140137
@@ -168,14 +165,7 @@ If `verbose` is `true`, print out the various engines as they are searched.
168165
function probe_platform_engines!(;verbose::Bool = false)
169166
global gen_download_cmd, gen_list_tarball_cmd, gen_package_cmd
170167
global gen_unpack_cmd, parse_tarball_listing, gen_sh_cmd
171-
global tempdir_symlink_creation
172-
173-
# First things first, determine whether tempdir() can have symlinks created
174-
# within it. This is important for our copyderef workaround for e.g. SMBFS
175-
tempdir_symlink_creation = probe_symlink_creation(tempdir())
176-
if verbose
177-
@info("Symlinks allowed in $(tempdir()): $(tempdir_symlink_creation)")
178-
end
168+
global gen_symlink_parser
179169

180170
agent = "BinaryProvider.jl (https://github.com/JuliaPackaging/BinaryProvider.jl)"
181171
# download_engines is a list of (test_cmd, download_opts_functor)
@@ -193,18 +183,18 @@ function probe_platform_engines!(;verbose::Bool = false)
193183
# windows, so we create generator functions to spit back functors to invoke
194184
# the correct 7z given the path to the executable:
195185
unpack_7z = (exe7z) -> begin
196-
return (tarball_path, out_path) ->
197-
pipeline(`$exe7z x $(tarball_path) -y -so`,
198-
`$exe7z x -si -y -ttar -o$(out_path)`)
186+
return (tarball_path, out_path, excludelist = nothing) ->
187+
pipeline(`$exe7z x $(tarball_path) -y -so`,
188+
`$exe7z x -si -y -ttar -o$(out_path) $(excludelist == nothing ? [] : "-x@$(excludelist)")`)
199189
end
200190
package_7z = (exe7z) -> begin
201191
return (in_path, tarball_path) ->
202192
pipeline(`$exe7z a -ttar -so a.tar "$(joinpath(".",in_path,"*"))"`,
203193
`$exe7z a -si $(tarball_path)`)
204194
end
205195
list_7z = (exe7z) -> begin
206-
return (path) ->
207-
pipeline(`$exe7z x $path -so`, `$exe7z l -ttar -y -si`)
196+
return (path; verbose = false) ->
197+
pipeline(`$exe7z x $path -so`, `$exe7z l -ttar -y -si $(verbose ? ["-slt"] : [])`)
208198
end
209199

210200
# Tar is rather less verbose, and we don't need to search multiple places
@@ -214,31 +204,128 @@ function probe_platform_engines!(;verbose::Bool = false)
214204
# package_opts_functor, list_opts_functor, parse_functor). The probulator
215205
# will check each of them by attempting to run `$test_cmd`, and if that
216206
# works, will set the global compression functions appropriately.
217-
gen_7z = (p) -> (unpack_7z(p), package_7z(p), list_7z(p), parse_7z_list)
207+
208+
# the regex at the last position is meant for parsing the symlinks from verbose 7z-listing
209+
# "Path = ([^\r\n]+)\r?\n" matches the symlink name which is followed by an optional return and a new line
210+
# (?:[^\r\n]+\r?\n)+ = a group of non-empty lines (information belonging to one file is written as a block of lines followed by an empty line)
211+
# more info on regex and a powerful online tester can be found at https://regex101.com
212+
# "Symbolic Link = ([^\r\n]+)" matches the source filename (the file the symlink points to)
213+
# Demo 7z listing of tar files:
214+
# 7-Zip [64] 16.04 : Copyright (c) 1999-2016 Igor Pavlov : 2016-10-04
215+
#
216+
#
217+
# Listing archive:
218+
# --
219+
# Path =
220+
# Type = tar
221+
# Code Page = UTF-8
222+
#
223+
# ----------
224+
# Path = .
225+
# Folder = +
226+
# Size = 0
227+
# Packed Size = 0
228+
# Modified = 2018-08-22 11:44:23
229+
# Mode = 0rwxrwxr-x
230+
# User = travis
231+
# Group = travis
232+
# Symbolic Link =
233+
# Hard Link =
234+
235+
# Path = .\lib\libpng.a
236+
# Folder = -
237+
# Size = 10
238+
# Packed Size = 0
239+
# Modified = 2018-08-22 11:44:51
240+
# Mode = 0rwxrwxrwx
241+
# User = travis
242+
# Group = travis
243+
# Symbolic Link = libpng16.a
244+
# Hard Link =
245+
#
246+
# Path = .\lib\libpng16.a
247+
# Folder = -
248+
# Size = 334498
249+
# Packed Size = 334848
250+
# Modified = 2018-08-22 11:44:49
251+
# Mode = 0rw-r--r--
252+
# User = travis
253+
# Group = travis
254+
# Symbolic Link =
255+
# Hard Link =
256+
gen_7z = (p) -> (unpack_7z(p), package_7z(p), list_7z(p), parse_7z_list, r"Path = ([^\r\n]+)\r?\n(?:[^\r\n]+\r?\n)+Symbolic Link = ([^\r\n]+)"s)
218257
compression_engines = Tuple[]
219258

259+
(tmpfile, io) = mktemp()
260+
write(io, "Demo file for tar listing (Julia package BinaryProvider.jl)")
261+
close(io)
262+
220263
for tar_cmd in [`tar`, `busybox tar`]
264+
# try to determine the tar list format
265+
local symlink_parser
266+
try
267+
# Windows 10 now has a `tar` but it needs the `-f -` flag to use stdin/stdout
268+
# The Windows 10 tar does not work on substituted drives (`subst U: C:\Users`)
269+
# If a drive letter is part of the filename, then tar spits out a warning on stderr:
270+
# "tar: Removing leading drive letter from member names" - but it works properly
271+
tarListing = read(pipeline(`$tar_cmd -cf - $tmpfile`, `$tar_cmd -tvf -`), String)
272+
# obtain the text of the line before the filename
273+
m = match(Regex("((?:\\S+\\s+)+?)$tmpfile"), tarListing)[1]
274+
# count the number of words before the filename
275+
nargs = length(split(m, " "; keepempty = false))
276+
# build a regex for catching the symlink:
277+
# "^l" = line starting with l
278+
# "(?:\S+\s+){$nargs}" = nargs repetitions of a non-capturing group of non-spaces "\S+" followed by spaces "\s+"
279+
# "(.+?)" = a non-greedy sequence of characters: the symlink
280+
# "(?: -> (.+?))?" = an optional group of " -> " followed by a non-greedy sequence of characters: the source of the link
281+
# "\r?\$" = matches the end of line with an optional return character for some OSes
282+
# Demo listings
283+
# drwxrwxr-x 0 sabae sabae 0 Sep 5 2018 collapse_the_symlink/
284+
# lrwxrwxrwx 0 sabae sabae 0 Sep 5 2018 collapse_the_symlink/foo -> foo.1
285+
# -rw-rw-r-- 0 sabae sabae 0 Sep 5 2018 collapse_the_symlink/foo.1
286+
# lrwxrwxrwx 0 sabae sabae 0 Sep 5 2018 collapse_the_symlink/foo.1.1 -> foo.1
287+
# lrwxrwxrwx 0 sabae sabae 0 Sep 5 2018 collapse_the_symlink/broken -> obviously_broken
288+
#
289+
# drwxrwxr-x sabae/sabae 0 2018-09-05 18:19 collapse_the_symlink/
290+
# lrwxrwxrwx sabae/sabae 0 2018-09-05 18:19 collapse_the_symlink/foo -> foo.1
291+
#
292+
# lrwxrwxr-x 1000/1000 498007696 2009-11-27 00:14:00 link1 -> source1
293+
# lrw-rw-r-- 1000/1000 1359020032 2019-06-03 12:02:03 link2 -> sourcedir/source2
294+
#
295+
# now a pathological link "2009 link with blanks"
296+
# this can only be tracked by determining the tar format beforehand:
297+
# lrw-rw-r-- 0 1000 1000 1359020032 Jul 8 2009 2009 link with blanks -> target with blanks
298+
symlink_parser = Regex("^l(?:\\S+\\s+){$nargs}(.+?)(?: -> (.+?))?\\r?\$", "m")
299+
catch
300+
# generic expression for symlink parsing
301+
# this will fail if the symlink name contains space characters (which is highly improbable, though)
302+
# "^l.+?" = a line starting with an "l" followed by a non-greedy sequence of characters
303+
# "(\S+?)" = the filename consisting of non-space characters; the rest as above
304+
symlink_parser = r"^l.+? (\S+?)(?: -> (.+?))?\r?$"m
305+
end
221306
# Some tar's aren't smart enough to auto-guess decompression method. :(
222-
unpack_tar = (tarball_path, out_path) -> begin
307+
unpack_tar = (tarball_path, out_path, excludelist = nothing) -> begin
223308
Jjz = "z"
224309
if endswith(tarball_path, ".xz")
225310
Jjz = "J"
226311
elseif endswith(tarball_path, ".bz2")
227312
Jjz = "j"
228313
end
229-
return `$tar_cmd -x$(Jjz)f $(tarball_path) --directory=$(out_path)`
314+
return `$tar_cmd -x$(Jjz)f $(tarball_path) --directory=$(out_path) $(excludelist == nothing ? [] : "--exclude-from=$(excludelist)")`
230315
end
231316
package_tar = (in_path, tarball_path) ->
232317
`$tar_cmd -czvf $tarball_path -C $(in_path) .`
233-
list_tar = (in_path) -> `$tar_cmd -tzf $in_path`
318+
list_tar = (in_path; verbose = false) -> `$tar_cmd $(verbose ? "-tzvf" : "-tzf") $in_path`
234319
push!(compression_engines, (
235320
`$tar_cmd --help`,
236321
unpack_tar,
237322
package_tar,
238323
list_tar,
239324
parse_tar_list,
325+
symlink_parser
240326
))
241327
end
328+
rm(tmpfile, force = true)
242329

243330
# sh_engines is just a list of Cmds-as-paths
244331
sh_engines = [
@@ -347,13 +434,14 @@ function probe_platform_engines!(;verbose::Bool = false)
347434
end
348435

349436
# Search for a compression engine
350-
for (test, unpack, package, list, parse) in compression_engines
437+
for (test, unpack, package, list, parse, parse_symlinks) in compression_engines
351438
if probe_cmd(`$test`; verbose=verbose)
352439
# Set our compression command generators
353440
gen_unpack_cmd = unpack
354441
gen_package_cmd = package
355442
gen_list_tarball_cmd = list
356443
parse_tarball_listing = parse
444+
gen_symlink_parser = parse_symlinks
357445

358446
if verbose
359447
@info("Found compression engine $(test.exec[1])")
@@ -459,6 +547,11 @@ used by `list_tarball_files`.
459547
"""
460548
function parse_tar_list(output::AbstractString)
461549
lines = [chomp(l) for l in split(output, "\n")]
550+
for idx in 1:length(lines)
551+
if endswith(lines[idx], '\r')
552+
lines[idx] = lines[idx][1:end-1]
553+
end
554+
end
462555

463556
# Drop empty lines and directories
464557
lines = [l for l in lines if !isempty(l) && !endswith(l, '/')]
@@ -630,25 +723,31 @@ Unpack tarball located at file `tarball_path` into directory `dest`.
630723
"""
631724
function unpack(tarball_path::AbstractString, dest::AbstractString;
632725
verbose::Bool = false)
633-
# The user can force usage of our dereferencing workaround for filesystems
726+
727+
# unpack into dest
728+
mkpath(dest)
729+
730+
# The user can force usage of our dereferencing workarounds for filesystems
634731
# that don't support symlinks, but it is also autodetected.
635-
copyderef = get(ENV, "BINARYPROVIDER_COPYDEREF", "") == "true" ||
636-
(tempdir_symlink_creation && !probe_symlink_creation(dest))
637-
638-
# If we should "copyderef" what we do is to unpack into a temporary directory,
639-
# then copy without preserving symlinks into the destination directory. This
640-
# is to work around filesystems that are mounted (such as SMBFS filesystems)
641-
# that do not support symlinks. Note that this does not work if you are on
642-
# a system that completely disallows symlinks (Even within temporary
643-
# directories) such as Windows XP/7.
644-
true_dest = dest
732+
copyderef = (get(ENV, "BINARYPROVIDER_COPYDEREF", "") == "true") || !probe_symlink_creation(dest)
733+
734+
# If we should "copyderef" what we do is to unpack everything except symlinks
735+
# then copy the sources of the symlinks to the destination of the symlink instead.
736+
# This is to work around filesystems that are mounted (such as SMBFS filesystems)
737+
# that do not support symlinks.
738+
739+
excludelist = nothing
740+
645741
if copyderef
646-
dest = mktempdir()
742+
symlinks = list_tarball_symlinks(tarball_path)
743+
if length(symlinks) > 0
744+
(excludelist, io) = mktemp()
745+
write(io, join([s[1] for s in symlinks], "\n"))
746+
close(io)
747+
end
647748
end
648749

649-
# unpack into dest
650-
mkpath(dest)
651-
oc = OutputCollector(gen_unpack_cmd(tarball_path, dest); verbose=verbose)
750+
oc = OutputCollector(gen_unpack_cmd(tarball_path, dest, excludelist); verbose=verbose)
652751
try
653752
if !wait(oc)
654753
error()
@@ -660,34 +759,19 @@ function unpack(tarball_path::AbstractString, dest::AbstractString;
660759
error("Could not unpack $(tarball_path) into $(dest)")
661760
end
662761

663-
if copyderef
664-
# We would like to use `cptree(; follow_symlinks=false)` here, but it
665-
# freaks out if there are any broken symlinks, which is too finnicky
666-
# for our use cases. For us, we will just print a warning and continue.
667-
function cptry_harder(src, dst)
668-
mkpath(dst)
669-
for name in readdir(src)
670-
srcname = joinpath(src, name)
671-
dstname = joinpath(dst, name)
672-
if isdir(srcname)
673-
cptry_harder(srcname, dstname)
674-
else
675-
try
676-
Base.Filesystem.sendfile(srcname, dstname)
677-
catch e
678-
if isa(e, Base.IOError)
679-
if verbose
680-
@warn("Could not copy $(srcname) to $(dstname)")
681-
end
682-
else
683-
rethrow(e)
684-
end
685-
end
686-
end
762+
if copyderef && length(symlinks) > 0
763+
@info("Replacing symlinks in tarball by their source files ...\n" * join(string.(symlinks),"\n"))
764+
for s in symlinks
765+
sourcefile = joinpath(dest, replace(s[2], r"(?:\.[\\/])(.*)" => s"\1"))
766+
destfile = joinpath(dest, replace(s[1], r"(?:\.[\\/])(.*)" => s"\1"))
767+
768+
if isfile(sourcefile)
769+
cp(sourcefile, destfile, force = true)
770+
else
771+
@warn("Symlink source '$sourcefile' does not exist!")
687772
end
688773
end
689-
cptry_harder(dest, true_dest)
690-
rm(dest; recursive=true, force=true)
774+
rm(excludelist; force = true)
691775
end
692776
end
693777

src/Prefix.jl

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ using SHA
77
export Prefix, bindir, libdir, includedir, logdir, activate, deactivate,
88
extract_name_version_platform_key, extract_platform_key, isinstalled,
99
install, uninstall, manifest_from_url, manifest_for_file,
10-
list_tarball_files, verify, temp_prefix, package
10+
list_tarball_files, list_tarball_symlinks, verify, temp_prefix, package
1111

1212

1313
# Temporary hack around https://github.com/JuliaLang/julia/issues/26685
@@ -470,6 +470,30 @@ function list_tarball_files(path::AbstractString; verbose::Bool = false)
470470
return parse_tarball_listing(collect_stdout(oc))
471471
end
472472

473+
"""
474+
list_tarball_symlinks(tarball_path::AbstractString; verbose::Bool = false)
475+
476+
Given a `.tar.gz` filepath, return a dictionary of symlinks in the archive
477+
"""
478+
function list_tarball_symlinks(tarball_path::AbstractString; verbose::Bool = false)
479+
if !isdefined(BinaryProvider, :gen_symlink_parser)
480+
error("Call `probe_platform_engines!()` before `list_tarball_symlinks()`")
481+
end
482+
oc = OutputCollector(gen_list_tarball_cmd(tarball_path; verbose = true); verbose = verbose)
483+
try
484+
if !wait(oc)
485+
error()
486+
end
487+
catch
488+
error("Could not list contents of tarball $(tarball_path)")
489+
end
490+
output = collect_stdout(oc)
491+
492+
mm = [m.captures for m in eachmatch(gen_symlink_parser, output)]
493+
symlinks = [m[1] => joinpath(splitdir(m[1])[1], split(m[2], "/")...) for m in mm]
494+
return symlinks
495+
end
496+
473497
"""
474498
verify(path::AbstractString, hash::AbstractString;
475499
verbose::Bool = false, report_cache_status::Bool = false)

0 commit comments

Comments
 (0)