From c9e6e7410df170bf48109efca39b6300224190c3 Mon Sep 17 00:00:00 2001 From: direct Date: Wed, 11 May 2022 15:49:26 +1000 Subject: [PATCH] =?utf8?q?=D0=94=D0=B5=D0=B4=D1=83=D0=BF=D0=BB=D0=B8=D0=BA?= =?utf8?q?=D0=B0=D1=82=D0=BE=D1=80,=20=D0=BF=D0=B5=D1=80=D0=B2=D0=B0=D1=8F?= =?utf8?q?=20=D0=B1=D0=BE=D0=BB=D0=B5=D0=B5=20=D0=BC=D0=B5=D0=BD=D0=B5?= =?utf8?q?=D0=B5=20=D1=80=D0=B0=D0=B1=D0=BE=D1=87=D0=B0=D1=8F=20=D0=B0?= =?utf8?q?=D0=BB=D1=8C=D1=84=D0=B0.?= MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit --- dedupler/dedupler.php | 392 ++++++++++++++++++++++++++++++++++-------- 1 file changed, 317 insertions(+), 75 deletions(-) diff --git a/dedupler/dedupler.php b/dedupler/dedupler.php index 8a43459..10a1fee 100644 --- a/dedupler/dedupler.php +++ b/dedupler/dedupler.php @@ -1,9 +1,9 @@ #!/usr/bin/env php execute($FIELD); + if(!is_link($path)) + { + $counter_files_total++; + $FIELD[0] = $path; + $FIELD[1] = filesize($path); + if ($ignore_size === FALSE || ($ignore_size !== FALSE && $FIELD[1] > $ignore_size)) + { + $counter_files ++; + $DB_AddPath->execute($FIELD); + } + } } } -echo "reading directory tree... $counter_dirs dirs $counter_files files \r"; +echo "reading directory tree... $counter_dirs dirs $counter_files files, ignored files ".$counter_files_total - $counter_files." \r"; } function help_topic($short = FALSE) { global $argv; echo 'File deduplicator ver. ', VERSION, PHP_EOL; echo 'Usage: ', $argv[0],' [-options] directory',PHP_EOL; - echo 'Searches for identical files and allows you to remove duplicates', PHP_EOL ,'by deleting them or creating links instead.', PHP_EOL; + echo 'Searches for identical files and allows you to remove duplicates', PHP_EOL ,'by deleting them or creating links instead. Symlinks and hardlinks will be ignored.', PHP_EOL; if ($short == TRUE) { echo 'Use -h for help.', PHP_EOL, PHP_EOL; return; } + $opts[] = '-I size'; + $descr[] = 'Ignore files having size smaller then specified size in this parameter. By default size is equal 0 bytes. Size can be specified in kBytes, MBytes and GBytes using suffix k, m, g. Examle -I125 -I10k -I=5G.'; $opts[] = '-n'; - $descr[] = 'Action as do nothing (default).'; + $descr[] = 'Dryrun. Search and report but do nothing (default).'; $opts[] = '-r'; $descr[] = 'Report to file in csv format, if file not set send to STDIN.'; $opts[] = '-d'; @@ -53,7 +62,7 @@ function help_topic($short = FALSE) $opts[] = '-s'; $descr[] = 'Delete duplicates and create symlinks. Duplicates must have same uid, gid and permissions, otherwise you should use -i. Conflict with -n -d -l.'; $opts[] = '-i'; - $descr[] = 'Ignore different uid, gid and permissions between duplicates. It will be used from the first file.'; + $descr[] = 'Ignore different uid, gid and permissions between duplicates when searching.'; $opts[] = '-h'; $descr[] = 'Show this help topic.'; $opts[] = '-v'; @@ -88,156 +97,389 @@ function reporter() } } + +//инициализация $counter_dirs = 0; $counter_files = 0; -$tmpfile = '/tmp/dedupler.tmp'; +$counter_files_total = 0; +$tmpfile = '/dev/shm/dedupler.tmp'; $optind = null; -$opts = getopt('nrdlsihvV'); -var_dump($opts,$argv); +$opts = getopt('nrdlsihvVI::'); +$dir = false; +$ignore_size = false; +//var_dump($opts);exit; + +function shortmd5file ($filepath) +{ + $fh = @fopen($filepath, 'r'); + if(!$fh) + { + return false; + } + $head = fread($fh, 512); + $seekstat = fseek($fh, -512, SEEK_END); + $tail = fread($fh, 512); + fclose($fh); + return md5($head.$tail); +} + +// проверка на доступность модуля sqlite для PDO +if (array_search('sqlite',PDO::getAvailableDrivers()) === false) +{ + echo 'SQLite3 module for PHP not found, but required.',PHP_EOL; + exit(3); + +} + +//обработка опций +// получение директории либо ее отсутствие if($argc > 1) { - $dir = array_pop ($argv); + $dir = array_pop ($argv); + if (!is_dir($dir)) + { + $dir = false; + } } +// если программа запущена без параметров, то вызываем короткий хелп if ($argc == 1) { help_topic(TRUE); exit(); } - +// вывести версию и выйти if (isset($opts['V'])) { echo VERSION,PHP_EOL; exit(); } +// вывести полный хелп if (isset($opts['h'])) { help_topic(); exit(); } +if (isset($opts['I'])) +{ +// if (!is_array($opts['I'])) + if ($opts['I'] === false) + { + $ignore_size = 0; + } + else + { + if (ctype_digit($opts['I'])) + { + $ignore_size = $opts['I']; + } + else + { + $sizesuffix = strtolower(substr($opts['I'], -1)); + $sizeprefix = intval(substr($opts['I'], 0, -1)); + switch ($sizesuffix) + { + case 'k': + $ignore_size = $sizeprefix * 1000; + break; + case 'm': + $ignore_size = $sizeprefix * 1000000; + break; + case 'g': + $ignore_size = $sizeprefix * 1000000000; + break; + default: + + break; + } + } + + } +} +if ($ignore_size !== false) +{ + echo "Warning! Files having size smaller then {$ignore_size} bytes will be ignored.", PHP_EOL; +} + +//var_dump($ignore_size);exit; +// проверка на использование взаимоисключающих параметров, проверяется то, что если использован только один параметр из 4х возможных $colopts['n'] = FALSE; $colopts['d'] = FALSE; $colopts['l'] = FALSE; $colopts['s'] = FALSE; - if (count(array_diff_key ($colopts,$opts)) < 3 ) { echo 'You can not use options -n, -d, -l, -s together in any combinations, because it conflict between themselves.', PHP_EOL; exit(2); } + +// в этой точке становится понятно, что будет выполняться основная работа, остается определиться с режимом дедупликации +echo 'File deduplicator ver. ', VERSION, PHP_EOL; + +// тут подразумевается режим n +if (isset($opts['i'])) +{ + echo "Ignoring UID GID and PERMS for compare files",PHP_EOL; +} $_actmode = 0; if (isset($opts['d'])) { $_actmode = 1; + echo 'File deduplicate mode: delete', PHP_EOL; + } if (isset($opts['l'])) { $_actmode = 2; + echo 'File deduplicate mode: delete + hardlink', PHP_EOL; } if (isset($opts['s'])) { $_actmode = 3; + echo 'File deduplicate mode: delete + symlink', PHP_EOL; +} +if ($_actmode == 0) +{ + echo 'File deduplicate mode: dryrun', PHP_EOL; } - - - -if (!is_dir($dir)) +// если каталог не указан или указан не существующий, то выводим ошибку и выходим. +if ($dir === false) { - echo $dir,' is not a directory!',PHP_EOL; + echo 'Directory not specified or path is not a directory!',PHP_EOL; exit(1); } + +//подготовка к сканированию if (substr($dir, -1) == DIRECTORY_SEPARATOR) { $dir = substr($dir,0,-1); } -$_FSTREE = []; if (file_exists($tmpfile)) { unlink($tmpfile); } +// открываем субд $DBH = new PDO("sqlite:".$tmpfile); $DBH->setAttribute(PDO::ATTR_DEFAULT_FETCH_MODE, PDO::FETCH_ASSOC); $DBH->setAttribute( PDO::ATTR_ERRMODE, PDO::ERRMODE_EXCEPTION ); + +// подготовка структуры $DBH->exec ('CREATE TABLE fstree ( id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL, - path text, - size INTEGER + path varchar(1024), + size bigint, + inodev varchar(64) ) '); +$DBH->exec ('CREATE TABLE inodev + ( + id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL, + inodev unique, + uidgid varchar(64), + tailhead varchar(64), + md5 varchar(64) + ) + '); +$DBH->exec ('CREATE INDEX filesize ON fstree (size)'); +$DBH->exec ('CREATE INDEX inodev2 ON inodev (inodev)'); +$DBH->exec ('CREATE INDEX uidgid ON inodev (uidgid)'); +$DBH->exec ('CREATE INDEX tailhead ON inodev (tailhead)'); +$DBH->exec ('CREATE INDEX md5 ON inodev (md5)'); + + -$STH = $DBH->prepare("INSERT INTO fstree (path, size) values (?, ?)"); +// подготовка запросов +$DB_AddPath = $DBH->prepare("INSERT INTO fstree (path, size) values (?, ?)"); +$DB_GetBySize = $DBH->prepare("select id, path from fstree where size = ?"); +$DB_GetSizeByGroup = $DBH->prepare('select size from fstree GROUP BY `size`'); +$DB_PurgeUnique = $DBH->prepare('delete from fstree where id IN (select id from fstree GROUP BY `size` HAVING count(*)=1)'); +$DB_AddInodev = $DBH->prepare('update fstree set inodev = ? where id = ?'); +$DB_AddMeta = $DBH->prepare('insert or ignore into inodev (inodev, uidgid) values (?, ?)'); +$DB_AddTailhead = $DBH->prepare('update inodev set tailhead = ? where inodev = ?'); +$DB_AddMd5 = $DBH->prepare('update inodev set md5 = ? where inodev = ?'); +$DB_GetAllPath = $DBH->prepare('select id, path from fstree'); +$DB_GetAllPathCount = $DBH->prepare('select count(*) from fstree'); +$DB_GetAllUidgid = $DBH->prepare('select uidgid from inodev group by uidgid'); +$DB_GetInodevsByUid = $DBH->prepare('select inodev.id, inodev, path from inodev join fstree using (inodev) where uidgid = ? group by inodev'); +$DB_GetInodevsBySameTailheadUid = $DBH->prepare('select inodev.id, inodev, path from inodev join fstree using (inodev) where uidgid = ? and tailhead in (select tailhead from inodev where uidgid = ? and tailhead not null group by tailhead HAVING count(*)>1) group by inodev'); +$DB_GetInodevsBySameMd5Uid = $DBH->prepare('select inodev.id, inodev, path from inodev join fstree using (inodev) where uidgid = ? and md5 in (select md5 from inodev where uidgid = ? and md5 not null group by md5 HAVING count(*)>1) group by inodev'); +$DB_GetAllMd5Dup = $DBH->prepare('select md5 from inodev where uidgid = ? and md5 not null group by md5 HAVING count(*)>1;'); +$DB_GetPathByMd5 = $DBH->prepare('select inodev, path from inodev join fstree using (inodev) where md5 = ?'); + + +//чтение дерева каталогов с заносом в базу и очисткой от заведомо уникальных файлов $DBH->beginTransaction(); $tree = get_dir_tree($dir); $DBH->commit(); -$STH = $DBH->prepare("select path from fstree where size = ?"); +echo PHP_EOL; +echo 'purge known unique files from tree...', PHP_EOL; +$DB_PurgeUnique->execute(); -echo "\n"; -$stmt_sizes = $DBH->query('select size from fstree GROUP BY `size` HAVING count(*)>1'); -while ($size = $stmt_sizes->fetch()) +//анализ оставшихся файлов, явно имеющих дубли по признаку размера +echo 'Get and store metadata for remaining files...', PHP_EOL; +$DB_GetAllPathCount->execute(); +$total_files = $DB_GetAllPathCount->fetchColumn(); +$cnt_files = 0; +$DB_GetAllPath->execute(); +while ($idpath = $DB_GetAllPath->fetch()) { - $size = implode($size); - echo PHP_EOL, "now analyzing group of files having size $size bytes:", PHP_EOL; - $STH->execute(array($size)); - while ($path = $STH->fetch()) + // получение метаданных о файле и заполнение БД + $cnt_files++; + $path = $idpath['path']; + echo $total_files," / ", $cnt_files, "\r"; + $stat = stat($path); + $inodev = $stat['dev']."+".$stat['ino']; + if (isset($opts['i'])) + { + $uidgid = 0; + } + else + { + $uidgid = $stat['uid']."+".$stat['gid']."+".$stat['mode']; + } + $DB_AddInodev->execute(array($inodev ,$idpath['id'])); + $DB_AddMeta->execute(array($inodev, $uidgid )); + +} +echo PHP_EOL; + +//анализ групп файлов с одинаковым uidgid +echo 'analyzing groups of files having similar UID and GID...', PHP_EOL; +$DB_GetAllUidgid->execute(); +while ($uidgid = $DB_GetAllUidgid->fetch()) +{ + $uidgid = implode($uidgid); + echo PHP_EOL, "now analyzing group of files having UID GID $uidgid", PHP_EOL; + + // получение и заполнение tailhead для всех файлов юзера дублирующихся по размеру + $DB_GetInodevsByUid->execute(array($uidgid)); + $DBH->beginTransaction(); + $cnt_files = 0; + while ($InodevData = $DB_GetInodevsByUid->fetch()) + { + $cnt_files++; + echo $cnt_files,"\r"; + $shortmd5 = shortmd5file($InodevData['path']); + $DB_AddTailhead->execute(array($shortmd5, $InodevData['inodev'])); + } + $DBH->commit(); + + // получение и заполнение md5 для всех файлов юзера дублирующихся по tailhead + $DB_GetInodevsBySameTailheadUid->execute(array($uidgid,$uidgid)); + $DBH->beginTransaction(); + $cnt_files = 0; + while ($InodevData = $DB_GetInodevsBySameTailheadUid->fetch()) + { + $cnt_files++; + echo $cnt_files,"\r"; + $md5 = md5_file($InodevData['path']); + $DB_AddMd5->execute(array($md5, $InodevData['inodev'])); + } + $DBH->commit(); + + // обработка копий, выявленных хешированием md5 + $DB_GetAllMd5Dup->execute(array($uidgid)); + $cnt_files = 0; + while ($md5 = $DB_GetAllMd5Dup->fetch()) { - $path = implode($path); - if (!is_link($path)) + $cnt_files++; + echo $cnt_files,"\r"; + $md5 = implode($md5); + $DB_GetPathByMd5->execute(array($md5)); + $MD5GRP = array(); + $INODEV_SIZE = array(); + while ($MD5PATH = $DB_GetPathByMd5->fetch()) { - $stat = stat($path); - if (!isset($first_flag)) - { - $first_path = $path; - unset($first_md5); - $first_inodev = $stat['dev']."+".$stat['ino']; - $first_uidgid = $stat['uid']."+".$stat['gid']; - $first_flag = TRUE; - echo $path, ' is first file',PHP_EOL; - } - else + $MD5GRP[$MD5PATH['inodev']][] = $MD5PATH['path']; + } + foreach ($MD5GRP as $inodev => $pathes) + { + $INODEV_SIZE[$inodev] = count($pathes); + } + arsort($INODEV_SIZE); + $first_inodev = array_key_first($INODEV_SIZE); + $first_inodev_path = array_shift($MD5GRP[$first_inodev]); + unset($MD5GRP[$first_inodev]); + echo "original is $first_inodev -> $first_inodev_path",PHP_EOL; + foreach ($MD5GRP as $inodev => $paths) + { + foreach ($paths as $path) { - $cmp_inodev = $stat['dev']."+".$stat['ino']; - $cmp_uidgid = $stat['uid']."+".$stat['gid']; - if ($first_inodev == $cmp_inodev) - { - echo $path, ' is hardlink',PHP_EOL; - continue; - } - if (!isset($first_md5)) - { - echo 'calculate md5 for first file',PHP_EOL; - $first_md5 = md5_file($first_path); - } - echo 'calculate md5 for next file',PHP_EOL; - $cmp_md5 = md5_file($path); - if ($first_md5 == $cmp_md5) - { - echo $path, ' is copy',PHP_EOL; - if ($mode == 0) - { - - } - } - else + echo " copy is $inodev -> $path"; + switch ($_actmode) { - echo $path, ' is unique',PHP_EOL; + case 0: + echo " ...do nothing",PHP_EOL; + break; + + default: + break; } } - + } + echo PHP_EOL; + } + + +// $DB_GetInodevsBySameMd5Uid->execute(array($uidgid,$uidgid)); +// while ($InodevData = $DB_GetInodevsBySameMd5Uid->fetch()) +// { var_dump($InodevData); +// $md5 = md5_file($InodevData['path']); +// $DB_AddMd5->execute(array($md5, $InodevData['inodev'])); +// } + +// if (!isset($first_flag)) +// { +// $first_path = $path; +// unset($first_md5); +// $first_inodev = $stat['dev']."+".$stat['ino']; +// $first_uidgid = $stat['uid']."+".$stat['gid']; +// $first_flag = TRUE; +// echo $path, ' is first file',PHP_EOL; +// } +// else +// { +// $cmp_inodev = $stat['dev']."+".$stat['ino']; +// $cmp_uidgid = $stat['uid']."+".$stat['gid']; +// if ($first_inodev == $cmp_inodev) +// { +// echo $path, ' is hardlink',PHP_EOL; +// continue; +// } +// if (!isset($first_md5)) +// { +// echo 'calculate md5 for first file',PHP_EOL; +// $first_md5 = md5_file($first_path); +// } +// echo 'calculate md5 for next file',PHP_EOL; +// $cmp_md5 = md5_file($path); +// if ($first_md5 == $cmp_md5) +// { +// echo $path, ' is copy',PHP_EOL; +// if ($_actmode == 0) +// { +// +// } +// } +// else +// { +// echo $path, ' is unique',PHP_EOL; +// +// } +// } + // var_dump($path); // var_dump(stat($path)); - } - } - unset($first_flag); + } - -- 2.39.5