This commit is contained in:
Denis Knauf 2021-09-02 23:18:31 +02:00
commit 3cc57e2e78
10 changed files with 638 additions and 0 deletions

6
.gitignore vendored Normal file
View file

@ -0,0 +1,6 @@
*.sw[opn]
*.so
*.o
Makefile
/tmp
/pkg

4
Gemfile Normal file
View file

@ -0,0 +1,4 @@
# Gems needed to build and run deduperemoverb.
source 'https://rubygems.org'
# FFI is used to call the compiled ioctl helper library.
gem 'ffi'
# SQLite access to the hash database.
gem 'sqlite3'
# Provides rake/extensiontask, which the Rakefile requires.
gem 'rake-compiler'

19
Gemfile.lock Normal file
View file

@ -0,0 +1,19 @@
GEM
remote: https://rubygems.org/
specs:
ffi (1.15.3)
rake (13.0.6)
rake-compiler (1.1.1)
rake
sqlite3 (1.4.2)
PLATFORMS
x86_64-linux
DEPENDENCIES
ffi
rake-compiler
sqlite3
BUNDLED WITH
2.2.25

21
Rakefile Normal file
View file

@ -0,0 +1,21 @@
# vim: set noet sw=2 ts=2 sts=2:
require 'rake'
require 'rake/extensiontask'
require 'bundler'
# Compile the C helper via rake-compiler; the built library is placed in ext/.
Rake::ExtensionTask.new('deduperemoverb') { |ext_task| ext_task.lib_dir = 'ext' }
# Set mode 0775 on the compiled library.
task(:chmod) { File.chmod(0o775, 'ext/deduperemoverb.so') }
# Remove a previously built library so the next compile starts from scratch.
# Does nothing when the library is not there.
task :clean do
  library = 'ext/deduperemoverb.so'
  # File.exist? instead of File.exists?: the latter was removed in Ruby 3.2.
  File.unlink library if File.exist? library
end
# `build` depends on clean, compile and chmod, in that order.
task build: %i[clean compile chmod]
# Register Bundler's gem helper tasks for this gem.
Bundler::GemHelper.install_tasks(name: 'deduperemoverb')

33
bin/deduperemoverb.rb Executable file
View file

@ -0,0 +1,33 @@
#!/usr/bin/env ruby
# vim: set noet sw=2 ts=2 sts=2:
require 'deduperemoverb'
require 'ostruct'
require 'logger'
# Global logger on STDERR. Each line is "HH:MM:SS.micros S message", where S
# is the first character of the severity; the program name is not printed.
$logger = Logger.new STDERR
$logger.formatter = proc do |severity, datetime, _progname, message|
  timestamp = datetime.strftime '%H:%M:%S.%6N'
  "#{timestamp} #{severity[0]} #{message}\n"
end
# Entry point.
#   ARGV[0]   - path to the existing hash database.
#   ARGV[1..] - passed to Duperemove#run as the list of directories.
# Exits with status 1 when interrupted or when any error is raised.
begin
  raise "Argument 0: Database expected" if ARGV.empty?
  dbfile = Pathname.new ARGV[0]
  raise "Argument 0: Database missing: #{dbfile}" unless dbfile.exist?
  # Scratch database: same basename as the database, below /run/user/<uid>.
  tempfile = Pathname.new("/run/user/#{Process.uid}").join dbfile.basename
  dr = Duperemove.new dbfile, tempfile
  dr.run ARGV[1..-1]
  dr.finish
rescue Interrupt
  $logger.info "was interrupted"
  exit 1
rescue StandardError => e
  # SystemExit is not a StandardError, so a deliberate `exit` passes through.
  $logger.fatal "#{e} (#{e.class})\n#{e.backtrace.join "\n\t"}"
  # Report the failure to the caller instead of finishing with status 0.
  exit 1
end

28
deduperemoverb.gemspec Normal file
View file

@ -0,0 +1,28 @@
# vim: set noet sw=2 ts=2 sts=2:
require 'rake'
# Gem specification for deduperemoverb: Ruby sources, the command line script
# and the C helper, which is built on installation through extconf.rb.
Gem::Specification.new do |s|
  s.name = 'deduperemoverb'
  s.version = '0.0.1'
  s.licenses = %w[LGPLv3]
  s.authors = 'Denis Knauf'
  s.homepage = 'https://git.denkn.at/deac/deduperemoverb'
  s.summary = 'Deduplication of file chunks of files found by duperemove.'
  s.files =
    FileList[
      'lib/**/*.rb',
      'bin/*',
      'Gemfile',
      'Gemfile.lock',
      'ext/*/*.c',
      'ext/*/extconf.rb'
    ]
  s.require_paths = %w[lib]
  # Declare the shipped script so RubyGems installs it as a command.
  s.executables = %w[deduperemoverb.rb]
  s.extensions = %w[ext/deduperemoverb/extconf.rb]
  s.add_development_dependency "rake", "~> 13"
  s.add_dependency "ffi", '~> 1.15'
  s.add_dependency "sqlite3", '~> 1.4'
  #s.install_tasks name: 'compile'
end

View file

@ -0,0 +1,22 @@
#include <errno.h>
#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/fs.h>
/* errno recorded by the most recent fideduperange() call; 0 if it succeeded. */
int fideduperange_errno_value = 0;

/*
 * Issue the FIDEDUPERANGE ioctl on src_fd with the request in arg.
 * Returns the ioctl result unchanged (-1 on failure). The matching errno is
 * kept for fideduperange_errno().
 */
int fideduperange( int src_fd, struct file_dedupe_range *arg) {
	int r = ioctl( src_fd, FIDEDUPERANGE, arg);
	/* errno is only meaningful after a failure; after success it may be stale. */
	fideduperange_errno_value = (-1 == r) ? errno : 0;
	return r;
}

/* Return the errno recorded by the last fideduperange() call (0 on success). */
int fideduperange_errno( void) {
	return fideduperange_errno_value;
}
/*
 * Export macro-only constants to callers that cannot see the C headers.
 * Fills config with { FIDEDUPERANGE, FILE_DEDUPE_RANGE_SAME,
 * FILE_DEDUPE_RANGE_DIFFERS }.
 *
 * The elements are 64 bit wide: the Ruby binding allocates and reads three
 * int64 slots, and the ioctl request code does not fit into a signed int.
 */
void fideduperange_consts( int64_t config[3]) {
	config[0] = (int64_t) FIDEDUPERANGE;
	config[1] = (int64_t) FILE_DEDUPE_RANGE_SAME;
	config[2] = (int64_t) FILE_DEDUPE_RANGE_DIFFERS;
}

View file

@ -0,0 +1,3 @@
require "mkmf"
# Generate the Makefile for the `deduperemoverb` extension.
create_makefile('deduperemoverb')

359
lib/deduperemoverb.rb Executable file
View file

@ -0,0 +1,359 @@
#!/usr/bin/env ruby
# vim: set noet sw=2 ts=2 sts=2:
require 'sqlite3'
require 'pathname'
require 'ostruct'
require 'deduperemoverb/file_dedupe_range'
class SQLite3::ResultSet
  # Step through the statement row by row and yield every row, translated by
  # the database and wrapped in ArrayWithTypesAndFields.
  # Stops as soon as the statement reports that it is done.
  def each_enum
    loop do
      raw = @stmt.step
      break if @stmt.done?
      translated = @db.translate_from_db @stmt.types, raw
      yield ArrayWithTypesAndFields.new( translated)
    end
  end
end
# Walk the directory +dir+ and yield the path of every regular file below it.
# Sub-directories are descended into; entries that are neither a directory
# nor a regular file are ignored. Helper for #recursive.
def _recursive dir, &exe
  # Dir.each_child never yields '.' or '..', so no filtering is needed.
  Dir.each_child dir do |name|
    path = File.join dir, name
    if File.directory? path
      # Hand the block down; without it the nested call cannot yield.
      _recursive path, &exe
    elsif File.file? path
      yield path
    end
  end
end

# Yield the path of every regular file found in +dir+.
#
# +dir+ may be a directory (walked recursively) or a single regular file
# (yielded as is). Without a block an Enumerator is returned.
#
# Raises ArgumentError if +dir+ does not exist. Existing entries that are
# neither a directory nor a regular file yield nothing.
def recursive dir, &exe
  return to_enum( __method__, dir) unless block_given?
  dir = dir.to_s
  if File.directory? dir
    _recursive dir, &exe
  elsif File.file? dir
    yield dir
  elsif not File.exist? dir
    raise ArgumentError, "Directory [#{dir}] does not exist."
  end
end
# Read the file +path+ in chunks of +chunksize+ bytes and yield each chunk
# together with the byte offset it starts at. Only the last chunk may be
# shorter than +chunksize+. An empty file yields nothing.
#
# Raises ArgumentError for a non-positive +chunksize+ and RuntimeError when
# a short read happens before the end of the file. Returns nil.
def hash_file path, chunksize, &exe
  raise ArgumentError, "chunksize must be positive" unless 0 < chunksize
  File.open path do |f|
    until f.eof?
      pos = f.pos
      chunk = f.read chunksize
      if chunk.nil? or (chunk.bytesize != chunksize and not f.eof?)
        raise "Read lesser than chunksize, but did not reach end of file. [#{path}]"
      end
      yield chunk, pos
    end
  end
  nil
end
# For every regular file found via #recursive in +dir+, yield
# (path, inode number, data, position) for each block #hash_file yields.
#
# +chunksize+ is handed to #hash_file, which requires it.
# NOTE(review): the 128 KiB default is an assumption — confirm it matches the
# block size of the database this is used with.
def hash_recursive dir, chunksize = 128 * 1024, &exe
  recursive dir do |path|
    ino = File::Stat.new( path).ino
    hash_file path, chunksize do |dgs, pos|
      yield path, ino, dgs, pos
    end
  end
end
class Duperemove
# One hashed block of one file, as recorded in the database, plus the File
# handle once #open succeeded.
class DFH
  # Describes why a file could not be opened for deduplication.
  # #open returns an instance of this class; it does not raise it.
  class OpenFailed < StandardError
    attr_reader :file, :error
    def initialize file, error
      @file, @error = file, error
      super "Open file `#{file}` failed: #{error}"
    end
  end

  attr_reader :digest, :filename, :ino, :subvol, :size, :offset, :file, :dedupe_seq, :last_dedupe_seq

  def initialize digest, filename, ino, subvol, size, offset, dedupe_seq, last_dedupe_seq
    @digest, @filename, @ino, @subvol, @size, @offset, @dedupe_seq, @last_dedupe_seq =
      digest, filename, ino, subvol, size, offset, dedupe_seq, last_dedupe_seq
    @file = nil
  end

  # Open the file (mode 'r+' unless +mode+ is given) and check that inode and
  # size still match the recorded values.
  #
  # Returns self on success, otherwise an OpenFailed describing the problem.
  # Only StandardError is converted into OpenFailed, so an Interrupt still
  # reaches the caller.
  def open mode = nil
    f = File.new @filename, mode || 'r+'
    s = f.stat
    mismatch =
      if s.ino != @ino
        "Not same inode: actual file #{s.ino} <=> hashed #{@ino}"
      elsif s.size != @size # and s.subvol == @subvol
        "Size differs: actual file #{s.size} <=> hashed #{@size}"
      end
    if mismatch
      f.close
      return OpenFailed.new( @filename, mismatch)
    end
    @file = f
    self
  rescue StandardError => e
    # f is nil when File.new itself failed.
    f&.close
    OpenFailed.new( @filename, "#{e} (#{e.class.name})")
  end

  # Close the file handle if one is open.
  def close
    @file.close if @file and not @file.closed?
  end
end
# Ask the kernel to deduplicate +length+ bytes at +offset+ of +fd+ into every
# destination in +dupes+.
#
# fd     - File or integer file descriptor of the source.
# dupes  - pairs of [destination (File or integer fd), destination offset].
#
# Returns a Hash mapping each destination, as it was passed in, to
# [bytes_deduped, status]. Raises the class returned by
# FileDedupeRange.errno when the call returns -1.
def self.duperemove fd, offset, length, dupes
  fileno_of = ->( io) { io.is_a?( File) ? io.fileno : io }
  request = FileDedupeRange[dupes.size].new offset, length
  # Maps file number back to the object the caller passed in.
  by_fileno = []
  dupes.each_with_index do |(dest, dest_offset), idx|
    dest_no = fileno_of.call dest
    by_fileno[dest_no] = dest
    slot = request[:destinations][idx]
    slot[:fd] = dest_no
    slot[:offset] = dest_offset
  end
  rv = FileDedupeRange.dedup fileno_of.call( fd), request
  raise FileDedupeRange.errno, "Deduplication failed for file" if -1 == rv
  results = {}
  request[:destinations].each do |d|
    results[by_fileno[d[:fd]]] = [d[:bytes_deduped], d[:status]]
  end
  results
end
# Yield every row [digest, count] of the digests in `selected` that are
# shared by more than one row of the join with `files`.
# The number of such digests is logged first.
# Without a block an Enumerator is returned.
def per_digest &exe
  # The method takes no arguments, so none are handed to the enumerator.
  return to_enum( __method__) unless block_given?
  sql = <<~EOSQL
    SELECT h.digest, count(1)
    FROM files NATURAL JOIN selected h
    GROUP BY h.digest
    HAVING count(1) > 1
  EOSQL
  @db.query "select count(1) from (#{sql})" do |rs|
    rs.each {|row| $logger.info "have Selected #digests: #{row.first}" }
  end
  @db.prepare sql do |stmt|
    stmt.execute
    $logger.info 'am iterating...'
    stmt.each( &exe)
  end
end
# Build the table ext.selected: all hashes rows whose digest belongs to a
# row with dedupe_seq > last_dedupe_seq, optionally restricted to files whose
# name matches one of +dirs+.
#
# dirs - Array of paths; it is modified in place, every entry is replaced by
#        its expanded path. An empty array disables the filename filter.
def create_selected dirs
  dirs.map! {|dir| Pathname.new( dir).expand_path.to_s }
  alternatives = dirs.map {|dir| Regexp.quote dir }.join '|'
  pattern = "^(?:#{alternatives}).*"
  $logger.info sprintf( "query for: pattern: %p", pattern)
  filtered = !dirs.empty?
  binds = filtered ? {pattern: pattern} : {}
  @db.execute <<~EOSQL, binds
    CREATE TABLE ext.candidates AS
    SELECT hashes.digest
    FROM hashes NATURAL JOIN files
    WHERE dedupe_seq > last_dedupe_seq
    #{'AND filename REGEXP :pattern' if filtered}
  EOSQL
  $logger.info 'select digests'
  @db.execute <<~EOSQL
    CREATE TABLE ext.selected AS
    SELECT hashes.digest, hashes.ino, hashes.subvol, loff, last_dedupe_seq
    FROM (
    SELECT distinct digest
    FROM candidates
    ) h NATURAL JOIN hashes
  EOSQL
  # candidates was only needed to build selected.
  @db.execute 'DROP TABLE ext.candidates'
  $logger.info 'index digests'
  @db.execute 'CREATE INDEX ext.sel_digest_idx ON selected (digest)'
end
# Progress counters: steps started, and items finished split into ok, skipped
# and failed. Every counting method returns the new value of its counter.
class Stat
  attr_accessor :step, :done, :ok, :skip, :error

  def initialize
    @step = 0
    @done = 0
    @ok = 0
    @skip = 0
    @error = 0
  end

  # Fixed-width summary: steps, then ok / skipped / failed.
  def to_s
    format "%010d (o%010d|s%010d|e%010d)", @step, @ok, @skip, @error
  end

  def step!
    @step += 1
  end

  def ok!
    @done += 1
    @ok += 1
  end

  def skip!
    @done += 1
    @skip += 1
  end

  # Print +err+ to STDERR and count it as a failure.
  def error! err
    STDERR.puts err
    @done += 1
    @error += 1
  end
end
# Reported through Stat#error! when the kernel answers a dedupe request for
# one destination with a non-zero status.
class DedupError < StandardError
  attr_reader :file, :status
  def initialize file, status
    @file, @status = file, status
    super "Deduplication of `#{file.filename}` failed: status #{status}"
  end
end

# Deduplicate all files of one digest against the first file that can be
# opened.
#
# digest      - the digest the files share (not used for the dedupe itself).
# (fst, *es)  - list of DFH; the first openable entry becomes the source,
#               the entries after it are the destinations.
#
# Destinations whose dedupe_seq is not greater than last_dedupe_seq are
# skipped, destinations that cannot be opened are counted as errors.
# The rest is handed to Duperemove.duperemove in slices of 16, and
# last_dedupe_seq is updated for every destination reported with status 0.
#
# Returns the seconds spent in Duperemove.duperemove (0.0 if no source could
# be opened).
def dedup_digest_files digest, (fst, *es)
  stat, block_size = @stat, @config.block_size
  #STDERR.printf "%p: %p | %p\n", digest, fst, es
  rt = 0.0
  stat.step!
  @db.transaction do
    # Use the first file that can be opened as the source.
    until fst.nil?
      x = fst.open 'r'
      case x
      when DFH
        stat.ok!
        break
      when Exception
        stat.error! x
        fst, *es = es
      end
    end
    # No file could be opened: there is nothing to deduplicate against.
    next if fst.nil?
    @dedupe_ok_stmt.execute digest: fst.digest, ino: fst.ino, subvol: fst.subvol, seq: fst.dedupe_seq
    es.lazy.
      reject do |f|
        up_to_date = f.dedupe_seq <= f.last_dedupe_seq
        stat.skip! if up_to_date
        # The block has to return true for the file to be left out.
        up_to_date
      end.
      select do |f|
        case r = f.open
        when DFH
          true
        else
          stat.error! r
          false
        end
      end.
      each_slice( 16) do |fs|
        begin
          t1 = Time.now
          rs =
            Duperemove.duperemove fst.file,
              fst.offset,
              [ fst.size - fst.offset, block_size ].min,
              fs.map {|f| [f.file, f.offset] }.to_h
          rt += Time.now - t1
          #STDERR.printf "%p: results: %p\n", digest, es.map {|e| [ e.filename, rs[e.file] ] }.to_h
          fs.each do |e|
            err = rs[e.file][1]
            if 0 == err
              @dedupe_ok_stmt.execute digest: e.digest, ino: e.ino, subvol: e.subvol, seq: e.dedupe_seq
              stat.ok!
            else
              stat.error! DedupError.new( e, err)
            end
          end
        ensure
          # Destinations are only needed for the duration of their slice.
          fs.each( &:close)
        end
      end
  end
  rt
ensure
  fst&.close
end
# Prepare the statements used while deduplicating, yield, and close them
# again afterwards, whether or not the block raised.
#
#   @files_stmt     - all rows of `selected` joined with `files` for :digest.
#   @dedupe_ok_stmt - records :seq as last_dedupe_seq of one hashes row.
#
# The statements guarded by `if false` are disabled and never prepared.
# Returns the value of the block.
def prepared_statements &exe
  @files_stmt = @db.prepare <<~EOSQL
    SELECT digest, filename, h.ino, h.subvol, size, loff, dedupe_seq, last_dedupe_seq
    FROM selected h NATURAL JOIN files
    WHERE digest = :digest
    ORDER BY dedupe_seq - last_dedupe_seq
  EOSQL
  @setseq_stmt = @db.prepare <<~EOSQL if false
    UPDATE files
    SET dedupe_seq = :value
    WHERE filename = :filename AND 0 < dedupe_seq
  EOSQL
  @dedupe_ok_stmt = @db.prepare <<~EOSQL
    UPDATE hashes
    SET last_dedupe_seq = :seq
    WHERE digest = :digest AND ino = :ino AND subvol = :subvol
  EOSQL
  @dedupe_ok_stmt = @db.prepare <<~EOSQL if false
    INSERT INTO log (ino, subvol, ok)
    VALUES (:ino, :subvol, 1)
    ON CONFLICT (ino, subvol)
    DO UPDATE SET ok = ok + 1
  EOSQL
  @dedupe_error_stmt = @db.prepare <<~EOSQL if false
    INSERT INTO log (ino, subvol, failed)
    VALUES (:ino, :subvol, 1)
    ON CONFLICT (ino, subvol)
    DO UPDATE SET failed = failed + 1
  EOSQL
  yield
ensure
  # Close every statement that was prepared and is still open.
  [@files_stmt, @setseq_stmt, @dedupe_ok_stmt].each do |stmt|
    stmt.close if stmt and not stmt.closed?
  end
end
# Deduplicate everything selected for +dirs+ (no filter if nil or empty).
#
# For every digest returned by #per_digest the matching files are loaded and
# handed to #dedup_digest_files. Progress is written to STDERR on a single,
# repeatedly overwritten line: the counters, the query time, the time spent
# deduplicating and the total time for the digest.
def run dirs = nil
  $logger.info "am starting..."
  create_selected( dirs || [])
  stat = @stat = Stat.new
  prepared_statements do
    per_digest do |digest, _count|
      groups = Hash.new {|h,k| h[k] = [] }
      STDERR.print "\r#{stat}"
      started = Time.now
      @files_stmt.execute digest: digest do |rs|
        rs.each do |dg, filename, ino, subvol, size, offset, dedupe_seq, last_dedupe_seq|
          groups[dg].push DFH.new( dg, filename, ino, subvol, size, offset, dedupe_seq, last_dedupe_seq)
        end
      end
      queried = Time.now
      STDERR.printf "\r%s %0.6f", stat.to_s, queried - started
      rt = 0
      groups.each {|dg, fs| rt += dedup_digest_files( dg, fs) }
      STDERR.printf "\r%s %0.6f %0.6f %0.6f", stat, queried - started, rt, Time.now - started
    end
  end
ensure
  # Terminate the progress line.
  STDERR.puts
end
# Open the hash database and attach a fresh scratch database as `ext`.
#
# dbfile   - path of the existing database (anything responding to to_s).
# tempfile - Pathname of the scratch database; an existing file is removed.
#
# The config table is loaded into @config. The environment variable
# `dedupe_sequence`, if set, overrides the configured dedupe sequence.
# Raises RuntimeError unless the database version is 2.0.
def initialize dbfile, tempfile
  @db = SQLite3::Database.new dbfile.to_s
  # REGEXP support comes from the pcre extension.
  @db.enable_load_extension true
  @db.load_extension '/usr/lib/sqlite3/pcre.so'
  @config = OpenStruct.new
  @db.execute( "SELECT * FROM config") {|key, value| @config[key.to_sym] = value }
  @config.version = [@config.version_major, @config.version_minor].join '.'
  env_sequence = ENV['dedupe_sequence']
  @config.dedupe_sequence = env_sequence.to_i if env_sequence
  @dedupe_sequence = @config.dedupe_sequence
  unless '2.0' == @config.version
    raise "Database created by newer duperemove than supported (expected: 2.0, got: #{@config.version})"
  end
  tempfile.unlink if tempfile.exist?
  $logger.info "use ext-DB: #{tempfile}"
  @db.execute "ATTACH ? AS ext", tempfile.to_s
  migrate_last_dedupe_seq
end
# Add the column hashes.last_dedupe_seq if the table does not have it yet.
# Existing rows get the configured dedupe sequence as default.
def migrate_last_dedupe_seq
  found = false
  @db.query 'PRAGMA table_info(hashes)' do |rs|
    # The column name is the second field of every table_info row.
    found = rs.any? {|(_, name, *)| 'last_dedupe_seq' == name }
  end
  return if found
  $logger.info "add missing column last_dedupe_seq to table hashes"
  # SQLite does not accept a bound parameter in a DEFAULT clause, so the
  # value is written into the statement. Integer() guarantees a plain number.
  default = Integer( @config.dedupe_sequence)
  @db.execute "ALTER TABLE hashes ADD COLUMN last_dedupe_seq INTEGER DEFAULT #{default}"
end
# Log that the run is finished, together with the dedupe sequence in use.
def finish
  $logger.info format( 'finish (sequence: %s)', @dedupe_sequence)
end
end

View file

@ -0,0 +1,143 @@
# vim: set noet sw=2 ts=2 sts=2:
require 'ffi'
require 'pathname'
# Lookup table from errno number to its Errno class: Errno::Errnos[n] is the
# class whose Errno constant equals n.
Errno::Errnos = []
Errno.constants.each do |const_name|
  klass = Errno.const_get const_name
  # Only direct SystemCallError subclasses carrying an Errno constant count.
  next unless klass.is_a?( Class)
  next unless klass.superclass == SystemCallError and klass.const_defined?( :Errno)
  Errno::Errnos[klass::Errno] = klass
end
module FileDedupeRange
extend FFI::Library
# Load the compiled helper library from a path relative to this file.
# `%w[...]` builds the list of path components (the `%` was missing).
# NOTE(review): the Rakefile places the library in ext/ — confirm that
# <dir of this file>/ext/deduperemoverb/deduperemoverb.so is where it ends up.
ffi_lib Pathname.new( __FILE__).
  dirname.
  join( *%w[ext deduperemoverb deduperemoverb.so]).
  expand_path.
  to_s
# struct file_dedupe_range_info {
# __s64 dest_fd; /* in - destination file */
# __u64 dest_offset; /* in - start of extent in destination */
# __u64 bytes_deduped; /* out - total # of bytes we were able to dedupe from this file. */
# /* status of this dedupe operation:
# * < 0 for error
# * == FILE_DEDUPE_RANGE_SAME if dedupe succeeds
# * == FILE_DEDUPE_RANGE_DIFFERS if data differs
# */
# __s32 status; /* out */
# __u32 reserved; /* must be zero */
# };
# One destination of a dedupe request (struct file_dedupe_range_info).
# fd and offset are set by the caller; bytes_deduped and status are read
# back after the request.
class Destination < FFI::Struct
  layout :fd, :int64,
    :offset, :uint64,
    :bytes_deduped, :uint64,
    :status, :int32,
    :reserved, :uint32

  def fd=(i)
    self[:fd] = i
  end

  def fd
    self[:fd]
  end

  def offset=(i)
    self[:offset] = i
  end

  def offset
    self[:offset]
  end

  def bytes_deduped
    self[:bytes_deduped]
  end

  def status
    self[:status]
  end

  # Wrap the inherited initializer so the output fields and the reserved
  # field always start at zero.
  alias initialize_without_defaults initialize
  def initialize_with_defaults *a
    initialize_without_defaults( *a)
    self[:status] = 0
    self[:bytes_deduped] = 0
    self[:reserved] = 0
  end
  alias initialize initialize_with_defaults

  def inspect
    format "#<%s fd=%d offset=%d bytes_deduped=%d status=%d>",
      self.class.name, fd, offset, bytes_deduped, status
  end
end
# /* from struct btrfs_ioctl_file_extent_same_args */
# struct file_dedupe_range {
# __u64 src_offset; /* in - start of extent in source */
# __u64 src_length; /* in - length of extent */
# __u16 dest_count; /* in - total elements in info array */
# __u16 reserved1; /* must be zero */
# __u32 reserved2; /* must be zero */
# struct file_dedupe_range_info info[0];
# };
# Common base of the dedupe request structs (struct file_dedupe_range).
# The struct ends in an array of destinations, so every destination count
# needs its own subclass whose layout is set up through layout_init.
class Base < FFI::Struct
  class <<self
    private
    # Define the layout for a request with +dest_count+ destinations.
    def layout_init dest_count
      layout :offset, :uint64,
        :length, :uint64,
        :dest_count, :uint16,
        :reserved1, :uint16,
        # __u32 in the kernel struct, not 16 bit.
        :reserved2, :uint32,
        :destinations, [Destination, dest_count]
    end
    public
    def inspect
      "#{superclass.name}[?]"
    end
  end

  def offset() self[:offset] end
  def length() self[:length] end
  def dest_count() self[:dest_count] end
  def destinations() self[:destinations] end

  # offset - start of the extent in the source file.
  # length - length of the extent.
  # dest_count is derived from the size of the destinations array.
  def initialize offset, length
    self[:offset], self[:length] = offset, length
    self[:reserved1] = self[:reserved2] = 0
    self[:dest_count] = layout[:destinations].size / Destination.size
  end

  def inspect
    sprintf "#<%s[%d] %d:%d dests=[%s]>",
      self.class.superclass.name,
      self[:dest_count].to_i,
      self[:offset],
      self[:length],
      destinations.map {|d|
        sprintf "#<%d:%d %d - %d>", d[:fd], d[:offset], d[:bytes_deduped], d[:status]
      }.join( ',')
  end
end
class <<self
  # Request classes already generated, indexed by destination count.
  @@classes = []

  # Build a request for +dest_count+ destinations.
  def new offset:, length:, dest_count:
    self[dest_count].new offset, length
  end

  # Return the request class for +dest_count+ destinations; it is created on
  # first use and reused afterwards.
  def [] dest_count
    @@classes[dest_count] ||= begin
      klass = Class.new( Base)
      klass.send :layout_init, dest_count
      klass
    end
  end

  # Errno class for the errno recorded by the last dedup call
  # (nil if the number is not in Errno::Errnos).
  def errno
    Errno::Errnos[_errno]
  end
end
# int fideduperange(int src_fd, struct file_dedupe_range *arg);
attach_function :dedup, :fideduperange, [:int, FileDedupeRange::Base.ptr], :int
# errno recorded by the most recent call of dedup
attach_function :_errno, :fideduperange_errno, [], :int
# Fetch constants from the C world, where they exist only as macros.
attach_function :_consts, :fideduperange_consts, [:pointer], :void
# Three 64-bit slots are handed to the helper and read back; the element
# width here has to match what the C function writes.
consts_buffer = FFI::MemoryPointer.new :int64, 3
_consts consts_buffer
REQUEST_CONST, SAME, DIFFERS = consts_buffer.read_array_of_int64( 3)
end