From: direct Date: Wed, 25 May 2022 06:59:13 +0000 (+1000) Subject: Последняя альфа. Предполагается, что весь основной функционал реализован X-Git-Url: http://git.ultra-x.su/?a=commitdiff_plain;h=b06df91bcb87f9bfef95a09d6b466ac068583aeb;p=dev Последняя альфа. Предполагается, что весь основной функционал реализован --- diff --git a/dedupler/dedupler.php b/dedupler/dedupler.php index 10a1fee..5cb7d20 100644 --- a/dedupler/dedupler.php +++ b/dedupler/dedupler.php @@ -1,6 +1,6 @@ #!/usr/bin/env php $val) { - preg_match_all('/(.{0,60})(\ |$)/', $descr[$key],$parts); + preg_match_all('/(.{0,57})(\ |$)/', $descr[$key], $parts); foreach ($parts[1] as $part) { if ($part == '') { continue; } - echo "\t", $val, "\t", $part, PHP_EOL; + if (strlen($val)> 3) + { + $_DELIM = "\t"; + } + else + { + $_DELIM = "\t\t"; + } + echo " ", $val, $_DELIM, $part, PHP_EOL; $val = ''; } } echo PHP_EOL; } -function reporter() +function reporter($group, $status, $path) +{ + global $_CSVREPORT; + if (is_resource($_CSVREPORT)) + { + fputcsv($_CSVREPORT, array($group, $status, $path), ';'); + } +} + +function shortmd5file ($filepath) +{ + $fh = @fopen($filepath, 'r'); + if(!$fh) + { + return false; + } + $head = fread($fh, 40); + $seekstat = fseek($fh, -40, SEEK_END); + $tail = fread($fh, 40); + fclose($fh); + return hash('crc32b', $head.$tail); +} + +function size_humanreadable($size) +{ + $size = (int) $size; + if ($size > 1099511627776) + { + $hr_size = round($size / 1099511627776, 2); + $hr_size .= ' TiBytes'; + } + elseif ($size > 1073741824) + { + $hr_size = round($size / 1073741824, 2); + $hr_size .= ' GiBytes'; + } + elseif ($size > 1048576) + { + $hr_size = round($size / 1048576, 2); + $hr_size .= ' MiBytes'; + } + elseif ($size > 1024) + { + $hr_size = round($size / 1024, 2); + $hr_size .= ' KiBytes'; + } + else + { + $hr_size = $size; + $hr_size .= ' Bytes'; + } + return $hr_size; +} + +function verbose ($str) { global $opts; - if (!isset($opts['r'])) + if (isset($opts['v'])) { - return; + echo $str; } - } + //инициализация $counter_dirs = 0; $counter_files = 0; $counter_files_total = 0; $tmpfile = '/dev/shm/dedupler.tmp'; $optind = null; -$opts = getopt('nrdlsihvVI::'); +$opts = getopt('ndlsihvVI::r::'); $dir = false; $ignore_size = false; //var_dump($opts);exit; - -function shortmd5file ($filepath) -{ - $fh = @fopen($filepath, 'r'); - if(!$fh) - { - return false; - } - $head = fread($fh, 512); - $seekstat = fseek($fh, -512, SEEK_END); - $tail = fread($fh, 512); - fclose($fh); - return md5($head.$tail); -} +echo PHP_EOL; // проверка на доступность модуля sqlite для PDO if (array_search('sqlite',PDO::getAvailableDrivers()) === false) @@ -163,6 +212,24 @@ if (isset($opts['h'])) exit(); } +if (isset($opts['r'])) +{ + if ($opts['r'] === false || is_array($opts['r'])) + { + $repfile = "php://stderr"; + } + else + { + $repfile = $opts['r']; + } + $_CSVREPORT = fopen($repfile, 'w'); + if ($_CSVREPORT === false) + { + echo $repfile,'can not be opened! CSV report disabled...', PHP_EOL; + unset($_CSVREPORT); + } +} + if (isset($opts['I'])) { // if (!is_array($opts['I'])) @@ -183,13 +250,13 @@ if (isset($opts['I'])) switch ($sizesuffix) { case 'k': - $ignore_size = $sizeprefix * 1000; + $ignore_size = $sizeprefix * 1024; break; case 'm': - $ignore_size = $sizeprefix * 1000000; + $ignore_size = $sizeprefix * 1048576; break; case 'g': - $ignore_size = $sizeprefix * 1000000000; + $ignore_size = $sizeprefix * 1073741824; break; default: @@ -199,10 +266,6 @@ if (isset($opts['I'])) } } -if ($ignore_size !== false) -{ - echo "Warning! Files having size smaller then {$ignore_size} bytes will be ignored.", PHP_EOL; -} //var_dump($ignore_size);exit; @@ -226,6 +289,12 @@ if (isset($opts['i'])) { echo "Ignoring UID GID and PERMS for compare files",PHP_EOL; } + +if ($ignore_size !== false) +{ + echo "Warning! Files having size smaller then ", size_humanreadable($ignore_size)," will be ignored.", PHP_EOL; +} + $_actmode = 0; if (isset($opts['d'])) @@ -289,6 +358,7 @@ $DBH->exec ('CREATE TABLE inodev ) '); $DBH->exec ('CREATE INDEX filesize ON fstree (size)'); +$DBH->exec ('CREATE INDEX inodev1 ON fstree (inodev)'); $DBH->exec ('CREATE INDEX inodev2 ON inodev (inodev)'); $DBH->exec ('CREATE INDEX uidgid ON inodev (uidgid)'); $DBH->exec ('CREATE INDEX tailhead ON inodev (tailhead)'); @@ -298,8 +368,6 @@ $DBH->exec ('CREATE INDEX md5 ON inodev (md5)'); // подготовка запросов $DB_AddPath = $DBH->prepare("INSERT INTO fstree (path, size) values (?, ?)"); -$DB_GetBySize = $DBH->prepare("select id, path from fstree where size = ?"); -$DB_GetSizeByGroup = $DBH->prepare('select size from fstree GROUP BY `size`'); $DB_PurgeUnique = $DBH->prepare('delete from fstree where id IN (select id from fstree GROUP BY `size` HAVING count(*)=1)'); $DB_AddInodev = $DBH->prepare('update fstree set inodev = ? where id = ?'); $DB_AddMeta = $DBH->prepare('insert or ignore into inodev (inodev, uidgid) values (?, ?)'); @@ -309,10 +377,12 @@ $DB_GetAllPath = $DBH->prepare('select id, path from fstree'); $DB_GetAllPathCount = $DBH->prepare('select count(*) from fstree'); $DB_GetAllUidgid = $DBH->prepare('select uidgid from inodev group by uidgid'); $DB_GetInodevsByUid = $DBH->prepare('select inodev.id, inodev, path from inodev join fstree using (inodev) where uidgid = ? group by inodev'); +$DB_GetInodevsByUidCount = $DBH->prepare('select count(*) from (select inodev.id, inodev, path from inodev join fstree using (inodev) where uidgid = ? group by inodev)'); $DB_GetInodevsBySameTailheadUid = $DBH->prepare('select inodev.id, inodev, path from inodev join fstree using (inodev) where uidgid = ? and tailhead in (select tailhead from inodev where uidgid = ? and tailhead not null group by tailhead HAVING count(*)>1) group by inodev'); -$DB_GetInodevsBySameMd5Uid = $DBH->prepare('select inodev.id, inodev, path from inodev join fstree using (inodev) where uidgid = ? and md5 in (select md5 from inodev where uidgid = ? and md5 not null group by md5 HAVING count(*)>1) group by inodev'); +$DB_GetInodevsBySameTailheadUidCount = $DBH->prepare('select count(*) from (select inodev.id, inodev, path from inodev join fstree using (inodev) where uidgid = ? and tailhead in (select tailhead from inodev where uidgid = ? and tailhead not null group by tailhead HAVING count(*)>1) group by inodev)'); $DB_GetAllMd5Dup = $DBH->prepare('select md5 from inodev where uidgid = ? and md5 not null group by md5 HAVING count(*)>1;'); $DB_GetPathByMd5 = $DBH->prepare('select inodev, path from inodev join fstree using (inodev) where md5 = ?'); +$DB_GetSizeByInodev = $DBH->prepare('select size from fstree where inodev = ? limit 1'); //чтение дерева каталогов с заносом в базу и очисткой от заведомо уникальных файлов @@ -320,7 +390,7 @@ $DBH->beginTransaction(); $tree = get_dir_tree($dir); $DBH->commit(); echo PHP_EOL; -echo 'purge known unique files from tree...', PHP_EOL; +echo 'Purge known unique files from tree...', PHP_EOL; $DB_PurgeUnique->execute(); //анализ оставшихся файлов, явно имеющих дубли по признаку размера @@ -334,7 +404,7 @@ while ($idpath = $DB_GetAllPath->fetch()) // получение метаданных о файле и заполнение БД $cnt_files++; $path = $idpath['path']; - echo $total_files," / ", $cnt_files, "\r"; + echo "Total/Current: ",$total_files," / ", $cnt_files, "\r"; $stat = stat($path); $inodev = $stat['dev']."+".$stat['ino']; if (isset($opts['i'])) @@ -352,46 +422,56 @@ while ($idpath = $DB_GetAllPath->fetch()) echo PHP_EOL; //анализ групп файлов с одинаковым uidgid -echo 'analyzing groups of files having similar UID and GID...', PHP_EOL; +echo 'Analyzing groups of files having similar UID and GID...', PHP_EOL; $DB_GetAllUidgid->execute(); while ($uidgid = $DB_GetAllUidgid->fetch()) { $uidgid = implode($uidgid); - echo PHP_EOL, "now analyzing group of files having UID GID $uidgid", PHP_EOL; + echo PHP_EOL, "Now analyzing group of files having UID GID $uidgid", PHP_EOL; // получение и заполнение tailhead для всех файлов юзера дублирующихся по размеру + echo 'Reading head and tail of the files...',PHP_EOL; + $DB_GetInodevsByUidCount->execute(array($uidgid)); + $total_files = $DB_GetInodevsByUidCount->fetchColumn(); + $cnt_files = 0; $DB_GetInodevsByUid->execute(array($uidgid)); $DBH->beginTransaction(); - $cnt_files = 0; while ($InodevData = $DB_GetInodevsByUid->fetch()) { $cnt_files++; - echo $cnt_files,"\r"; + echo "Total/Current: ",$total_files," / ", $cnt_files, "\r"; $shortmd5 = shortmd5file($InodevData['path']); $DB_AddTailhead->execute(array($shortmd5, $InodevData['inodev'])); } $DBH->commit(); + echo PHP_EOL; // получение и заполнение md5 для всех файлов юзера дублирующихся по tailhead + echo 'Reading full content of the files having the same tail and head...',PHP_EOL; + $DB_GetInodevsBySameTailheadUidCount->execute(array($uidgid,$uidgid)); + $total_files = $DB_GetInodevsBySameTailheadUidCount->fetchColumn(); + $cnt_files = 0; $DB_GetInodevsBySameTailheadUid->execute(array($uidgid,$uidgid)); $DBH->beginTransaction(); - $cnt_files = 0; while ($InodevData = $DB_GetInodevsBySameTailheadUid->fetch()) { $cnt_files++; - echo $cnt_files,"\r"; - $md5 = md5_file($InodevData['path']); + echo "Total/Current: ",$total_files," / ", $cnt_files, "\r"; + $md5 = hash_file('md5',$InodevData['path']); $DB_AddMd5->execute(array($md5, $InodevData['inodev'])); } $DBH->commit(); + echo PHP_EOL,PHP_EOL; // обработка копий, выявленных хешированием md5 $DB_GetAllMd5Dup->execute(array($uidgid)); $cnt_files = 0; + $DUP_TOTALSIZE = 0; + $DUP_FSIZE = 0; while ($md5 = $DB_GetAllMd5Dup->fetch()) { $cnt_files++; - echo $cnt_files,"\r"; + verbose("Group ".$cnt_files.PHP_EOL); $md5 = implode($md5); $DB_GetPathByMd5->execute(array($md5)); $MD5GRP = array(); @@ -408,78 +488,92 @@ while ($uidgid = $DB_GetAllUidgid->fetch()) $first_inodev = array_key_first($INODEV_SIZE); $first_inodev_path = array_shift($MD5GRP[$first_inodev]); unset($MD5GRP[$first_inodev]); - echo "original is $first_inodev -> $first_inodev_path",PHP_EOL; + verbose ("Original is $first_inodev_path".PHP_EOL); + reporter($cnt_files, "O", $first_inodev_path); + $DB_GetSizeByInodev->execute(array($first_inodev)); + $DUP_FSIZE = $DB_GetSizeByInodev->fetchColumn(); foreach ($MD5GRP as $inodev => $paths) { foreach ($paths as $path) { - echo " copy is $inodev -> $path"; + $DUP_TOTALSIZE = $DUP_TOTALSIZE + $DUP_FSIZE; + verbose (" Copy is $path"); + reporter($cnt_files, "C", $path); + switch ($_actmode) { case 0: - echo " ...do nothing",PHP_EOL; - + verbose(" ...do nothing".PHP_EOL); break; - - default: + case 1: + if (@unlink($path)) + { + verbose (" ...deleted".PHP_EOL); + } + else + { + verbose (" ...delete failed!".PHP_EOL); + } break; + + case 2: + if (@rename($path, $path.".tmpfb")) + { + if(@link($first_inodev_path, $path)) + { + if (@unlink($path.".tmpfb")) + { + verbose(" ...hardlinked".PHP_EOL); + } + else + { + @rename($path.".tmpfb", $path); + verbose (" ...hardlinking failed!".PHP_EOL); + } + } + else + { + @rename($path.".tmpfb", $path); + verbose (" ...hardlinking failed!".PHP_EOL); + } + } + else + { + verbose (" ...hardlinking failed!".PHP_EOL); + } + break; + + case 3: + if (@rename($path, $path.".tmpfb")) + { + if(@symlink($first_inodev_path, $path)) + { + if (@unlink($path.".tmpfb")) + { + verbose(" ...symlinked".PHP_EOL); + } + else + { + @rename($path.".tmpfb", $path); + verbose(" ...symlinking failed!".PHP_EOL); + } + } + else + { + @rename($path.".tmpfb", $path); + verbose(" ...symlinking failed!".PHP_EOL); + } + } + else + { + verbose(" ...symlinking failed!".PHP_EOL); + } + break; } } } - echo PHP_EOL; + verbose(PHP_EOL); } - - -// $DB_GetInodevsBySameMd5Uid->execute(array($uidgid,$uidgid)); -// while ($InodevData = $DB_GetInodevsBySameMd5Uid->fetch()) -// { var_dump($InodevData); -// $md5 = md5_file($InodevData['path']); -// $DB_AddMd5->execute(array($md5, $InodevData['inodev'])); -// } - -// if (!isset($first_flag)) -// { -// $first_path = $path; -// unset($first_md5); -// $first_inodev = $stat['dev']."+".$stat['ino']; -// $first_uidgid = $stat['uid']."+".$stat['gid']; -// $first_flag = TRUE; -// echo $path, ' is first file',PHP_EOL; -// } -// else -// { -// $cmp_inodev = $stat['dev']."+".$stat['ino']; -// $cmp_uidgid = $stat['uid']."+".$stat['gid']; -// if ($first_inodev == $cmp_inodev) -// { -// echo $path, ' is hardlink',PHP_EOL; -// continue; -// } -// if (!isset($first_md5)) -// { -// echo 'calculate md5 for first file',PHP_EOL; -// $first_md5 = md5_file($first_path); -// } -// echo 'calculate md5 for next file',PHP_EOL; -// $cmp_md5 = md5_file($path); -// if ($first_md5 == $cmp_md5) -// { -// echo $path, ' is copy',PHP_EOL; -// if ($_actmode == 0) -// { -// -// } -// } -// else -// { -// echo $path, ' is unique',PHP_EOL; -// -// } -// } - -// var_dump($path); -// var_dump(stat($path)); - - - + } +echo size_humanreadable($DUP_TOTALSIZE),($_actmode == 0) ? ' can be' : ''," deduplicated.",PHP_EOL;