#!/usr/bin/env ruby
# vim: set noet sw=2 ts=2 sts=2:
#
# Command-line entry point.
#
# Usage: deduperemoverb.rb DATABASE [PATH...]
#
# DATABASE is an existing hash database file; every further argument is handed
# to Duperemove#run as a path filter.  A scratch database with the same
# basename is placed below /run/user/<uid>.
#
# Exit status: 0 on success, 1 when interrupted or when any error occurred.

require 'deduperemoverb'
require 'ostruct'
require 'logger'
require 'pathname'

$logger = Logger.new STDERR
$logger.formatter =
  proc do |severity, datetime, _progname, message|
    sprintf "%s %s %s\n", datetime.strftime('%H:%M:%S.%6N'), severity[0], message
  end

begin
  raise "Argument 0: Database expected" if ARGV.empty?

  dbfile = Pathname.new ARGV[0]
  raise "Argument 0: Database missing: #{dbfile}" unless dbfile.exist?

  tempfile = Pathname.new("/run/user/#{Process.uid}").join dbfile.basename
  dr = Duperemove.new dbfile, tempfile
  dr.run ARGV.drop(1)
  dr.finish

rescue Interrupt
  $logger.info "was interrupted"
  exit 1

rescue => e
  # SystemExit is not a StandardError, so `exit` calls pass through untouched.
  $logger.fatal "#{e} (#{e.class})\n#{Array(e.backtrace).join "\n\t"}"
  # A failed run must be visible to the calling shell / scheduler.
  exit 1

end
# Walks +dir+ depth-first and yields the path of every regular file below it.
# Internal helper of #recursive; expects +dir+ to be an existing directory.
def _recursive(dir, &exe)
  # Dir.each_child never yields "." or "..", so no extra filtering is needed.
  Dir.each_child(dir) do |entry|
    path = File.join(dir, entry)
    if File.directory?(path)
      # The block has to be forwarded, otherwise the nested call has nothing
      # to yield to and raises LocalJumpError on the first nested file.
      _recursive(path, &exe)
    elsif File.file?(path)
      yield path
    end
  end
end

# Yields every regular file found below +dir+.  When +dir+ itself is a regular
# file it is yielded as the only element.
#
# dir:: String or anything responding to #to_s (e.g. Pathname)
#
# Returns an Enumerator when no block is given.
# Raises ArgumentError when +dir+ does not exist or is neither a directory nor
# a regular file.
def recursive(dir, &exe)
  return to_enum(__method__, dir) unless block_given?

  dir = dir.to_s
  if File.directory?(dir)
    _recursive(dir, &exe)
  elsif File.file?(dir)
    yield dir
  elsif File.exist?(dir)
    raise ArgumentError, "[#{dir}] is neither a directory nor a regular file."
  else
    raise ArgumentError, "Directory [#{dir}] does not exist."
  end
end

# Reads the file at +path+ in consecutive chunks of +chunksize+ bytes and
# yields <tt>chunk, offset</tt> for each of them.  The last chunk may be
# shorter; an empty file yields nothing.
#
# NOTE: despite its name this method yields the raw chunk data, it does not
# compute a digest itself.
#
# Raises ArgumentError for a non-positive +chunksize+ and RuntimeError when a
# short read happens before the end of the file.  Returns nil.
def hash_file(path, chunksize, &exe)
  unless chunksize.is_a?(Integer) && chunksize.positive?
    raise ArgumentError, "chunksize must be a positive Integer"
  end

  File.open(path, 'rb') do |f|
    until f.eof?
      offset = f.pos
      chunk = f.read(chunksize)
      break if chunk.nil?

      if chunk.bytesize < chunksize && !f.eof?
        raise "Read lesser than chunksize, but did not reach end of file. [#{path}]"
      end
      yield chunk, offset
    end
  end
  nil
end

# Runs #hash_file over every regular file below +dir+ and yields
# <tt>path, inode, chunk, offset</tt> for each chunk.
#
# chunksize:: bytes per chunk handed to #hash_file (default: 128 KiB)
def hash_recursive(dir, chunksize = 128 * 1024, &exe)
  recursive(dir) do |path|
    ino = File::Stat.new(path).ino
    hash_file(path, chunksize) do |chunk, pos|
      yield path, ino, chunk, pos
    end
  end
end
# Drives deduplication of files recorded in a hash database: it selects
# digests shared by several files, hands the matching extents to
# Duperemove.duperemove and records the sequence number of every extent that
# was deduplicated successfully.
#
# Relies on the global $logger and on FileDedupeRange (defined elsewhere).
class Duperemove
  # Describes a destination whose dedupe status was non-zero.
  # NOTE(review): no definition of DedupError is visible in this file; it is
  # defined here so a failed destination is reported instead of raising
  # NameError. Drop it if the project already provides one.
  class DedupError < StandardError
    attr_reader :file, :status

    # file:: the DFH that could not be deduplicated
    # status:: the status value reported for that destination
    def initialize(file, status)
      @file = file
      @status = status
      super "Deduplication of `#{file.filename}` failed: status #{status}"
    end
  end

  # One hashed extent of one file together with the File opened for it.
  class DFH
    # Returned (not raised) by DFH#open when the file cannot be used.
    class OpenFailed < StandardError
      attr_reader :file, :error

      def initialize(file, error)
        @file = file
        @error = error
        super "Open file `#{file}` failed: #{error}"
      end
    end

    attr_reader :digest, :filename, :ino, :subvol, :size, :offset, :file, :dedupe_seq, :last_dedupe_seq

    def initialize(digest, filename, ino, subvol, size, offset, dedupe_seq, last_dedupe_seq)
      @digest = digest
      @filename = filename
      @ino = ino
      @subvol = subvol
      @size = size
      @offset = offset
      @dedupe_seq = dedupe_seq
      @last_dedupe_seq = last_dedupe_seq
      @file = nil
    end

    # Opens the file (mode defaults to 'r+') and verifies that inode and size
    # still match the recorded values.
    #
    # Returns self on success, otherwise an OpenFailed instance describing the
    # problem.  Never raises for I/O errors.
    def open(mode = nil)
      handle = File.new(@filename, mode || 'r+')
      actual = handle.stat
      unless actual.ino == @ino
        handle.close
        return OpenFailed.new(@filename, "Not same inode: actual file #{actual.ino} <=> hashed #{@ino}")
      end
      unless actual.size == @size
        handle.close
        return OpenFailed.new(@filename, "Size differs: actual file #{actual.size} <=> hashed #{@size}")
      end
      @file = handle
      self
    rescue StandardError => e
      # File.new itself may have failed, in which case there is nothing to close.
      handle.close if handle && !handle.closed?
      OpenFailed.new(@filename, "#{e} (#{e.class.name})")
    end

    # Closes the underlying File if it is open.
    def close
      @file.close if @file && !@file.closed?
    end
  end

  # Counters for processed digests and per-file outcomes.
  class Stat
    attr_accessor :step, :done, :ok, :skip, :error

    def initialize
      @step = @done = @ok = @skip = @error = 0
    end

    def to_s
      sprintf "%010d (o%010d|s%010d|e%010d)", @step, @ok, @skip, @error
    end

    def step!
      @step += 1
    end

    def ok!
      @done += 1
      @ok += 1
    end

    def skip!
      @done += 1
      @skip += 1
    end

    # Prints +err+ to STDERR and counts it.
    def error!(err)
      STDERR.puts err
      @done += 1
      @error += 1
    end
  end

  # Deduplicates +length+ bytes at +offset+ of the source against every
  # destination in +dupes+.
  #
  # fd:: source, a File or an integer file descriptor
  # dupes:: enumerable of <tt>[File-or-descriptor, offset]</tt> pairs
  #
  # Returns a Hash mapping each destination (as passed in) to
  # <tt>[bytes_deduped, status]</tt>.  Raises whatever FileDedupeRange.errno
  # returns when the call reports -1.
  def self.duperemove(fd, offset, length, dupes)
    request = FileDedupeRange[dupes.size].new offset, length
    by_fileno = {}
    dupes.each_with_index do |(dest, dest_offset), i|
      fileno = dest.is_a?(File) ? dest.fileno : dest
      by_fileno[fileno] = dest
      request[:destinations][i][:fd] = fileno
      request[:destinations][i][:offset] = dest_offset
    end
    source = fd.is_a?(File) ? fd.fileno : fd
    rv = FileDedupeRange.dedup source, request
    raise FileDedupeRange.errno, "Deduplication failed for file" if -1 == rv

    result = {}
    request[:destinations].each do |d|
      result[by_fileno[d[:fd]]] = [d[:bytes_deduped], d[:status]]
    end
    result
  end

  # Opens the hash database and attaches a fresh scratch database as "ext".
  #
  # dbfile:: path of the hash database (responds to #to_s)
  # tempfile:: Pathname of the scratch database; an existing file is removed
  # pcre_extension:: path of the SQLite extension providing REGEXP
  def initialize(dbfile, tempfile, pcre_extension: '/usr/lib/sqlite3/pcre.so')
    @db = SQLite3::Database.new dbfile.to_s
    @db.enable_load_extension true
    @db.load_extension pcre_extension
    @config = OpenStruct.new
    @db.execute("SELECT * FROM config") { |k, v| @config[k.to_sym] = v }
    @config.version = "#{@config.version_major}.#{@config.version_minor}"
    @config.dedupe_sequence = ENV['dedupe_sequence'].to_i if ENV['dedupe_sequence']
    @dedupe_sequence = @config.dedupe_sequence
    unless '2.0' == @config.version
      raise "Database created by newer duperemove than supported (expected: 2.0, got: #{@config.version})"
    end
    tempfile.unlink if tempfile.exist?
    $logger.info "use ext-DB: #{tempfile}"
    @db.execute "ATTACH ? AS ext", [tempfile.to_s]
    migrate_last_dedupe_seq
  end

  # Adds the column hashes.last_dedupe_seq when the database lacks it, using
  # the configured dedupe sequence as default value.
  def migrate_last_dedupe_seq
    found = false
    @db.query 'PRAGMA table_info(hashes)' do |rs|
      found = rs.any? { |(_, name, *)| 'last_dedupe_seq' == name }
    end
    return if found

    $logger.info "add missing column last_dedupe_seq to table hashes"
    # SQLite requires a constant DEFAULT expression, bound parameters are not
    # accepted there.  Integer() guarantees that only a number is interpolated.
    default = Integer(@config.dedupe_sequence)
    @db.execute "ALTER TABLE hashes ADD COLUMN last_dedupe_seq INTEGER DEFAULT #{default}"
  end

  # Builds the scratch table ext.selected holding every hash row whose digest
  # belongs to at least one file with dedupe_seq > last_dedupe_seq.
  #
  # dirs:: paths restricting the candidates; empty means no restriction.
  #        The array is not modified.
  def create_selected(dirs)
    # "/" expands to "/", every other path has no trailing slash.
    roots = dirs.map { |dir| Pathname.new(dir).expand_path.to_s.chomp('/') }
    # Anchor at a path separator so that /data/foo does not match /data/foobar.
    pattern = "^(?:#{roots.map { |root| Regexp.quote root }.join '|'})(?:/|$)"
    $logger.info sprintf("query for: pattern: %p", pattern)
    filter = roots.empty? ? '' : 'AND filename REGEXP :pattern'
    binds = roots.empty? ? {} : { pattern: pattern }
    @db.execute(<<~EOSQL, binds)
      CREATE TABLE ext.candidates AS
      SELECT hashes.digest
      FROM hashes NATURAL JOIN files
      WHERE dedupe_seq > last_dedupe_seq
      #{filter}
    EOSQL
    $logger.info 'select digests'
    @db.execute <<~EOSQL
      CREATE TABLE ext.selected AS
      SELECT hashes.digest, hashes.ino, hashes.subvol, loff, last_dedupe_seq
      FROM (
        SELECT distinct digest
        FROM candidates
      ) h NATURAL JOIN hashes
    EOSQL
    @db.execute 'DROP TABLE ext.candidates'
    $logger.info 'index digests'
    @db.execute 'CREATE INDEX ext.sel_digest_idx ON selected (digest)'
  end

  # Yields <tt>digest, count</tt> for every selected digest that occurs more
  # than once.  Returns an Enumerator when no block is given.
  def per_digest(&exe)
    return to_enum(__method__) unless block_given?

    sql = <<~EOSQL
      SELECT h.digest, count(1)
      FROM files NATURAL JOIN selected h
      GROUP BY h.digest
      HAVING count(1) > 1
    EOSQL
    @db.query "select count(1) from (#{sql})" do |rs|
      rs.each { |row| $logger.info "have Selected #digests: #{row.first}" }
    end
    @db.prepare sql do |stmt|
      stmt.execute
      $logger.info 'am iterating...'
      stmt.each(&exe)
    end
  end

  # Prepares the statements used while deduplicating, yields, and closes them
  # again afterwards.  Returns the value of the block.
  #
  # The original source also carried alternative statements writing to a `log`
  # table; they were disabled (`if false`) and are therefore not prepared.
  def prepared_statements(&exe)
    @files_stmt = @db.prepare <<~EOSQL
      SELECT digest, filename, h.ino, h.subvol, size, loff, dedupe_seq, last_dedupe_seq
      FROM selected h NATURAL JOIN files
      WHERE digest = :digest
      ORDER BY dedupe_seq - last_dedupe_seq
    EOSQL

    @dedupe_ok_stmt = @db.prepare <<~EOSQL
      UPDATE hashes
      SET last_dedupe_seq = :seq
      WHERE digest = :digest AND ino = :ino AND subvol = :subvol
    EOSQL

    yield
  ensure
    [@files_stmt, @setseq_stmt, @dedupe_ok_stmt].each do |stmt|
      stmt.close if stmt && !stmt.closed?
    end
  end

  # Deduplicates all extents sharing +digest+.
  #
  # The first candidate that can be opened serves as source; the remaining
  # ones are skipped when already up to date (dedupe_seq <= last_dedupe_seq),
  # otherwise opened and deduplicated in batches of 16.
  #
  # digest:: the digest shared by all candidates
  # (fst, *es):: Array of DFH
  #
  # Returns the seconds spent inside Duperemove.duperemove as Float.
  def dedup_digest_files(digest, (fst, *es))
    stat = @stat
    block_size = @config.block_size
    rt = 0.0
    stat.step!
    @db.transaction do
      until fst.nil?
        opened = fst.open 'r'
        if opened.is_a?(DFH)
          stat.ok!
          break
        end
        stat.error! opened
        fst, *es = es
      end
      # No candidate could be opened, so there is nothing to dedupe against.
      next if fst.nil?

      @dedupe_ok_stmt.execute digest: fst.digest, ino: fst.ino, subvol: fst.subvol, seq: fst.dedupe_seq

      pending = es.lazy.reject do |f|
        up_to_date = f.dedupe_seq <= f.last_dedupe_seq
        stat.skip! if up_to_date
        up_to_date
      end
      openable = pending.select do |f|
        opened = f.open
        next true if opened.is_a?(DFH)

        stat.error! opened
        false
      end
      # Evaluated lazily, so only one batch of destinations is open at a time.
      openable.each_slice 16 do |fs|
        begin
          t1 = Process.clock_gettime(Process::CLOCK_MONOTONIC)
          rs =
            Duperemove.duperemove fst.file,
              fst.offset,
              [fst.size - fst.offset, block_size].min,
              fs.map { |f| [f.file, f.offset] }.to_h
          rt += Process.clock_gettime(Process::CLOCK_MONOTONIC) - t1
          fs.each do |e|
            status = rs[e.file][1]
            if 0 == status
              @dedupe_ok_stmt.execute digest: e.digest, ino: e.ino, subvol: e.subvol, seq: e.dedupe_seq
              stat.ok!
            else
              stat.error! DedupError.new(e, status)
            end
          end
        ensure
          fs.each(&:close)
        end
      end
    end
    rt
  ensure
    fst&.close
  end

  # Runs a complete deduplication pass, printing progress to STDERR.
  #
  # dirs:: optional list of paths restricting the files to look at
  def run(dirs = nil)
    $logger.info "am starting..."
    create_selected(dirs || [])
    stat = @stat = Stat.new

    prepared_statements do
      per_digest do |digest, _count|
        by_digest = Hash.new { |h, k| h[k] = [] }
        STDERR.print "\r#{stat}"
        t1 = Time.now
        @files_stmt.execute(digest: digest) do |rs|
          rs.each do |row_digest, filename, ino, subvol, size, offset, dedupe_seq, last_dedupe_seq|
            by_digest[row_digest].push DFH.new(row_digest, filename, ino, subvol, size, offset, dedupe_seq, last_dedupe_seq)
          end
        end
        t2 = Time.now
        STDERR.printf "\r%s %0.6f", stat.to_s, t2 - t1

        rt = 0
        by_digest.each do |group_digest, fs|
          rt += dedup_digest_files(group_digest, fs)
        end
        STDERR.printf "\r%s %0.6f %0.6f %0.6f", stat, t2 - t1, rt, Time.now - t1
      end
    end
  ensure
    STDERR.puts
  end

  # Logs the end of the run together with the dedupe sequence in use.
  def finish
    $logger.info "finish (sequence: #{@dedupe_sequence})"
  end
end
# Per-destination record of a dedupe request, mirroring the kernel structure:
#
#   struct file_dedupe_range_info {
#     __s64 dest_fd;        /* in  - destination file */
#     __u64 dest_offset;    /* in  - start of extent in destination */
#     __u64 bytes_deduped;  /* out - total # of bytes we were able to dedupe from this file. */
#     /* status of this dedupe operation:
#      * < 0 for error
#      * == FILE_DEDUPE_RANGE_SAME if dedupe succeeds
#      * == FILE_DEDUPE_RANGE_DIFFERS if data differs
#      */
#     __s32 status;         /* out */
#     __u32 reserved;       /* must be zero */
#   };
class Destination < FFI::Struct
  layout :fd,            :int64,
         :offset,        :uint64,
         :bytes_deduped, :uint64,
         :status,        :int32,
         :reserved,      :uint32

  # Input fields: descriptor and offset of the destination extent.
  def fd=(value)
    self[:fd] = value
  end

  def fd
    self[:fd]
  end

  def offset=(value)
    self[:offset] = value
  end

  def offset
    self[:offset]
  end

  # Output fields, read-only from Ruby.
  def bytes_deduped
    self[:bytes_deduped]
  end

  def status
    self[:status]
  end

  # Wrap the inherited constructor so that the output and reserved fields
  # start out as zero.
  alias initialize_without_defaults initialize
  def initialize_with_defaults(*args)
    initialize_without_defaults(*args)
    self[:status] = 0
    self[:bytes_deduped] = 0
    self[:reserved] = 0
  end
  alias initialize initialize_with_defaults

  def inspect
    format "#<%s fd=%d offset=%d bytes_deduped=%d status=%d>",
           self.class.name,
           self[:fd],
           self[:offset],
           self[:bytes_deduped],
           self[:status]
  end
end