#!/usr/bin/env php
<?php
-define('VERSION', '1.0a');
+define('VERSION', '1.3a');
function get_dir_tree($root)
{
- global $STH, $counter_dirs, $counter_files, $fh;
+ global $DB_AddPath, $counter_dirs, $counter_files, $counter_files_total, $ignore_size;
if (is_dir($root) === FALSE)
{
return FALSE;
}
else
{
- $counter_files ++;
- $FIELD[0] = $path;
- $FIELD[1] = filesize($path);
- $STH->execute($FIELD);
+ if(!is_link($path))
+ {
+ $counter_files_total++;
+ $FIELD[0] = $path;
+ $FIELD[1] = filesize($path);
+ if ($ignore_size === FALSE || ($ignore_size !== FALSE && $FIELD[1] > $ignore_size))
+ {
+ $counter_files ++;
+ $DB_AddPath->execute($FIELD);
+ }
+ }
}
}
-echo "reading directory tree... $counter_dirs dirs $counter_files files \r";
+// Parenthesize the subtraction: before PHP 8 "." and "-" shared precedence, so the unparenthesized form subtracted from the concatenated string instead of printing the ignored-file count.
+echo "reading directory tree... $counter_dirs dirs $counter_files files, ignored files ".($counter_files_total - $counter_files)." \r";
}
function help_topic($short = FALSE)
{
global $argv;
echo 'File deduplicator ver. ', VERSION, PHP_EOL;
echo 'Usage: ', $argv[0],' [-options] directory',PHP_EOL;
- echo 'Searches for identical files and allows you to remove duplicates', PHP_EOL ,'by deleting them or creating links instead.', PHP_EOL;
+ echo 'Searches for identical files and allows you to remove duplicates', PHP_EOL ,'by deleting them or creating links instead. Symlinks and hardlinks will be ignored.', PHP_EOL;
if ($short == TRUE)
{
echo 'Use -h for help.', PHP_EOL, PHP_EOL;
return;
}
+ $opts[] = '-I size';
+ $descr[] = 'Ignore files having size smaller then specified size in this parameter. By default size is equal 0 bytes. Size can be specified in kBytes, MBytes and GBytes using suffix k, m, g. Examle -I125 -I10k -I=5G.';
$opts[] = '-n';
- $descr[] = 'Action as do nothing (default).';
+ $descr[] = 'Dryrun. Search and report but do nothing (default).';
$opts[] = '-r';
$descr[] = 'Report to file in csv format, if file not set send to STDIN.';
$opts[] = '-d';
$opts[] = '-s';
$descr[] = 'Delete duplicates and create symlinks. Duplicates must have same uid, gid and permissions, otherwise you should use -i. Conflict with -n -d -l.';
$opts[] = '-i';
- $descr[] = 'Ignore different uid, gid and permissions between duplicates. It will be used from the first file.';
+ $descr[] = 'Ignore different uid, gid and permissions between duplicates when searching.';
$opts[] = '-h';
$descr[] = 'Show this help topic.';
$opts[] = '-v';
}
}
+
+// Initialization: scan counters, location of the temporary SQLite file and command-line option defaults
$counter_dirs = 0;
$counter_files = 0;
-$tmpfile = '/tmp/dedupler.tmp';
+$counter_files_total = 0;
+$tmpfile = '/dev/shm/dedupler.tmp';
$optind = null;
-$opts = getopt('nrdlsihvV');
-var_dump($opts,$argv);
+$opts = getopt('nrdlsihvVI::'); // "I::" declares -I with an optional value
+$dir = false; // false until a valid directory argument is found
+$ignore_size = false; // false means no size filter is applied
+//var_dump($opts);exit;
+
+/**
+ * Cheap content fingerprint: md5 of the first and the last 512 bytes of a file.
+ *
+ * Files shorter than 512 bytes cannot be seeked 512 bytes back from the end;
+ * for them the fingerprint covers only the bytes read from the start.
+ *
+ * @param string $filepath path of the file to fingerprint
+ * @return string|false 32-character hex md5, or false when the file cannot be opened or read
+ */
+function shortmd5file ($filepath)
+{
+    // 'rb' hashes raw bytes; @ silences the warning because failure is reported through the return value
+    $fh = @fopen($filepath, 'rb');
+    if(!$fh)
+    {
+        return false;
+    }
+    $head = fread($fh, 512);
+    if ($head === false)
+    {
+        // a read error must not be hashed as if the file were empty
+        fclose($fh);
+        return false;
+    }
+    $tail = '';
+    // fseek() returns -1 for files shorter than 512 bytes; $head already holds the whole file then
+    if (fseek($fh, -512, SEEK_END) === 0)
+    {
+        $tail = fread($fh, 512);
+        if ($tail === false)
+        {
+            fclose($fh);
+            return false;
+        }
+    }
+    fclose($fh);
+    return md5($head.$tail);
+}
+
+// The work database is SQLite accessed through PDO, so stop early when that driver is not available
+if (!in_array('sqlite', PDO::getAvailableDrivers(), true))
+{
+    echo 'SQLite3 module for PHP not found, but required.',PHP_EOL;
+    exit(3);
+}
+
+// Option handling
+// Take the last argument as the directory; $dir becomes false when it is not a directory
if($argc > 1)
{
- $dir = array_pop ($argv);
+ $dir = array_pop ($argv);
+ if (!is_dir($dir))
+ {
+ $dir = false;
+ }
}
+// If the program was started without parameters, show the short help
if ($argc == 1)
{
help_topic(TRUE);
exit();
}
-
+// Print the version and exit
if (isset($opts['V']))
{
echo VERSION,PHP_EOL;
exit();
}
+// Print the full help
if (isset($opts['h']))
{
help_topic();
exit();
}
+// Parse -I (size limit for ignored files). Accepted forms: a bare -I (0 bytes),
+// plain digits (bytes), or digits followed by k, m or g (decimal multipliers).
+// Anything else is rejected instead of being silently dropped.
+if (isset($opts['I']))
+{
+    $optI = $opts['I'];
+    // getopt() returns an array when the option is repeated; the last occurrence wins
+    if (is_array($optI))
+    {
+        $optI = end($optI);
+    }
+    if ($optI === false)
+    {
+        // -I given without a value
+        $ignore_size = 0;
+    }
+    elseif (preg_match('/^(\d+)([kmg]?)$/iD', $optI, $sizeparts))
+    {
+        $multipliers = array('' => 1, 'k' => 1000, 'm' => 1000000, 'g' => 1000000000);
+        $ignore_size = intval($sizeparts[1]) * $multipliers[strtolower($sizeparts[2])];
+    }
+    else
+    {
+        echo "Invalid size '{$optI}' for option -I. Use digits with an optional suffix k, m or g.", PHP_EOL;
+        exit(2);
+    }
+}
+if ($ignore_size !== false)
+{
+    // The scan keeps only files strictly larger than the limit, so the limit itself is ignored too
+    echo "Warning! Files having size of {$ignore_size} bytes or smaller will be ignored.", PHP_EOL;
+}
+
+//var_dump($ignore_size);exit;
+// Check for mutually exclusive options: at most one of the four options n, d, l, s may be used
$colopts['n'] = FALSE;
$colopts['d'] = FALSE;
$colopts['l'] = FALSE;
$colopts['s'] = FALSE;
-
if (count(array_diff_key ($colopts,$opts)) < 3 )
{
echo 'You can not use options -n, -d, -l, -s together in any combinations, because it conflict between themselves.', PHP_EOL;
exit(2);
}
+
+// At this point it is clear that the main work will run; what remains is to determine the deduplication mode
+echo 'File deduplicator ver. ', VERSION, PHP_EOL;
+
+// Mode n (dry run) is implied here unless one of -d, -l, -s overrides it below
+if (isset($opts['i']))
+{
+ echo "Ignoring UID GID and PERMS for compare files",PHP_EOL;
+}
$_actmode = 0;
if (isset($opts['d']))
{
$_actmode = 1;
+ echo 'File deduplicate mode: delete', PHP_EOL;
+
}
if (isset($opts['l']))
{
$_actmode = 2;
+ echo 'File deduplicate mode: delete + hardlink', PHP_EOL;
}
if (isset($opts['s']))
{
$_actmode = 3;
+ echo 'File deduplicate mode: delete + symlink', PHP_EOL;
+}
+if ($_actmode == 0)
+{
+ echo 'File deduplicate mode: dryrun', PHP_EOL;
}
-
-
-
-if (!is_dir($dir))
+// If the directory was not given or does not exist, print an error and exit.
+if ($dir === false)
{
- echo $dir,' is not a directory!',PHP_EOL;
+ echo 'Directory not specified or path is not a directory!',PHP_EOL;
exit(1);
}
+
+// Preparation for scanning: drop one trailing separator and remove a stale temporary database
if (substr($dir, -1) == DIRECTORY_SEPARATOR)
{
$dir = substr($dir,0,-1); // NOTE(review): a bare "/" becomes an empty string here - confirm root scans are not expected
}
-$_FSTREE = [];
if (file_exists($tmpfile))
{
unlink($tmpfile);
}
+// Open the database (a fresh SQLite file at $tmpfile); rows are fetched as associative arrays and errors throw exceptions
$DBH = new PDO("sqlite:".$tmpfile);
$DBH->setAttribute(PDO::ATTR_DEFAULT_FETCH_MODE, PDO::FETCH_ASSOC);
$DBH->setAttribute( PDO::ATTR_ERRMODE, PDO::ERRMODE_EXCEPTION );
+
+// Prepare the schema: fstree holds one row per scanned path, inodev one row per distinct inodev value with its metadata and hashes
$DBH->exec ('CREATE TABLE fstree
(
id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL,
- path text,
- size INTEGER
+ path varchar(1024),
+ size bigint,
+ inodev varchar(64)
)
');
+$DBH->exec ('CREATE TABLE inodev
+ (
+ id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL,
+ inodev unique,
+ uidgid varchar(64),
+ tailhead varchar(64),
+ md5 varchar(64)
+ )
+ ');
+$DBH->exec ('CREATE INDEX filesize ON fstree (size)');
+$DBH->exec ('CREATE INDEX inodev2 ON inodev (inodev)');
+$DBH->exec ('CREATE INDEX uidgid ON inodev (uidgid)');
+$DBH->exec ('CREATE INDEX tailhead ON inodev (tailhead)');
+$DBH->exec ('CREATE INDEX md5 ON inodev (md5)');
+
+
-$STH = $DBH->prepare("INSERT INTO fstree (path, size) values (?, ?)");
+// Prepare the statements used by the scan and by the analysis passes below
+$DB_AddPath = $DBH->prepare("INSERT INTO fstree (path, size) values (?, ?)");
+$DB_GetBySize = $DBH->prepare("select id, path from fstree where size = ?");
+$DB_GetSizeByGroup = $DBH->prepare('select size from fstree GROUP BY `size`');
+$DB_PurgeUnique = $DBH->prepare('delete from fstree where id IN (select id from fstree GROUP BY `size` HAVING count(*)=1)'); // drops paths whose size occurs only once
+$DB_AddInodev = $DBH->prepare('update fstree set inodev = ? where id = ?');
+$DB_AddMeta = $DBH->prepare('insert or ignore into inodev (inodev, uidgid) values (?, ?)'); // "or ignore": an inodev value that is already present is kept as is
+$DB_AddTailhead = $DBH->prepare('update inodev set tailhead = ? where inodev = ?');
+$DB_AddMd5 = $DBH->prepare('update inodev set md5 = ? where inodev = ?');
+$DB_GetAllPath = $DBH->prepare('select id, path from fstree');
+$DB_GetAllPathCount = $DBH->prepare('select count(*) from fstree');
+$DB_GetAllUidgid = $DBH->prepare('select uidgid from inodev group by uidgid');
+$DB_GetInodevsByUid = $DBH->prepare('select inodev.id, inodev, path from inodev join fstree using (inodev) where uidgid = ? group by inodev');
+$DB_GetInodevsBySameTailheadUid = $DBH->prepare('select inodev.id, inodev, path from inodev join fstree using (inodev) where uidgid = ? and tailhead in (select tailhead from inodev where uidgid = ? and tailhead not null group by tailhead HAVING count(*)>1) group by inodev');
+$DB_GetInodevsBySameMd5Uid = $DBH->prepare('select inodev.id, inodev, path from inodev join fstree using (inodev) where uidgid = ? and md5 in (select md5 from inodev where uidgid = ? and md5 not null group by md5 HAVING count(*)>1) group by inodev');
+$DB_GetAllMd5Dup = $DBH->prepare('select md5 from inodev where uidgid = ? and md5 not null group by md5 HAVING count(*)>1;');
+$DB_GetPathByMd5 = $DBH->prepare('select inodev, path from inodev join fstree using (inodev) where md5 = ?'); // NOTE(review): no uidgid filter, so rows of other uidgid groups with the same stored md5 are returned too - confirm intended
+
+
+// Read the directory tree into the database (inside one transaction), then purge files known to be unique by size
$DBH->beginTransaction();
$tree = get_dir_tree($dir);
$DBH->commit();
-$STH = $DBH->prepare("select path from fstree where size = ?");
+echo PHP_EOL;
+echo 'purge known unique files from tree...', PHP_EOL;
+$DB_PurgeUnique->execute();
-echo "\n";
-$stmt_sizes = $DBH->query('select size from fstree GROUP BY `size` HAVING count(*)>1');
-while ($size = $stmt_sizes->fetch())
+// Analyze the remaining files, all of which share their size with at least one other file
+echo 'Get and store metadata for remaining files...', PHP_EOL;
+$DB_GetAllPathCount->execute();
+$total_files = $DB_GetAllPathCount->fetchColumn(); // used only for the progress display
+$cnt_files = 0;
+$DB_GetAllPath->execute();
+while ($idpath = $DB_GetAllPath->fetch())
{
- $size = implode($size);
- echo PHP_EOL, "now analyzing group of files having size $size bytes:", PHP_EOL;
- $STH->execute(array($size));
- while ($path = $STH->fetch())
+ // Collect stat() metadata for the file and record it in the database
+ $cnt_files++;
+ $path = $idpath['path'];
+ echo $total_files," / ", $cnt_files, "\r";
+ $stat = stat($path);
+ if ($stat === false)
+ {
+     // The file vanished or became inaccessible after the scan. Skip it: its fstree.inodev
+     // stays NULL, so it never joins against the inodev table in the later passes
+     // (otherwise every such file would share the bogus inodev "+").
+     continue;
+ }
+ // device + inode identifies the physical file, so hard links share one inodev value
+ $inodev = $stat['dev']."+".$stat['ino'];
+ if (isset($opts['i']))
+ {
+     // -i: owner and mode are ignored, so every file lands in the same uidgid group
+     $uidgid = 0;
+ }
+ else
+ {
+     $uidgid = $stat['uid']."+".$stat['gid']."+".$stat['mode'];
+ }
+ $DB_AddInodev->execute(array($inodev ,$idpath['id']));
+ $DB_AddMeta->execute(array($inodev, $uidgid ));
+
+}
+echo PHP_EOL;
+
+// Analyze the files group by group, one group per distinct uidgid value
+echo 'analyzing groups of files having similar UID and GID...', PHP_EOL;
+$DB_GetAllUidgid->execute();
+while ($uidgid = $DB_GetAllUidgid->fetch())
+{
+ $uidgid = implode($uidgid); // the row holds a single column; turn it into a plain string
+ echo PHP_EOL, "now analyzing group of files having UID GID $uidgid", PHP_EOL;
+
+ // Compute and store the head/tail fingerprint for every inode of this uidgid group that survived the size filter
+ $DB_GetInodevsByUid->execute(array($uidgid));
+ $DBH->beginTransaction();
+ $cnt_files = 0;
+ while ($InodevData = $DB_GetInodevsByUid->fetch())
+ {
+     $cnt_files++;
+     echo $cnt_files,"\r";
+     $shortmd5 = shortmd5file($InodevData['path']);
+     // shortmd5file() returns false for an unreadable file. Bound as-is it would be stored as an
+     // empty string, pass the "tailhead not null" filters and group all unreadable files together;
+     // NULL keeps such files out of the later passes.
+     $DB_AddTailhead->execute(array($shortmd5 === false ? null : $shortmd5, $InodevData['inodev']));
+ }
+ $DBH->commit();
+
+ // Compute and store the full md5 for every inode of this uidgid group whose head/tail fingerprint is shared with another inode
+ $DB_GetInodevsBySameTailheadUid->execute(array($uidgid,$uidgid));
+ $DBH->beginTransaction();
+ $cnt_files = 0;
+ while ($InodevData = $DB_GetInodevsBySameTailheadUid->fetch())
+ {
+     $cnt_files++;
+     echo $cnt_files,"\r";
+     $md5 = md5_file($InodevData['path']);
+     // md5_file() returns false on failure. Bound as-is it would be stored as an empty string and
+     // make all unreadable files look like copies of each other; NULL is skipped by "md5 not null".
+     $DB_AddMd5->execute(array($md5 === false ? null : $md5, $InodevData['inodev']));
+ }
+ $DBH->commit();
+
+ // Handle the copies identified by md5 hashing: for every md5 shared by several inodes, report one original and its copies
+ $DB_GetAllMd5Dup->execute(array($uidgid));
+ $cnt_files = 0;
+ while ($md5 = $DB_GetAllMd5Dup->fetch())
{
- $path = implode($path);
- if (!is_link($path))
+ $cnt_files++;
+ echo $cnt_files,"\r";
+ $md5 = implode($md5);
+ $DB_GetPathByMd5->execute(array($md5));
+ $MD5GRP = array(); // inodev => list of paths carrying this md5
+ $INODEV_SIZE = array(); // inodev => number of paths found for it
+ while ($MD5PATH = $DB_GetPathByMd5->fetch())
{
- $stat = stat($path);
- if (!isset($first_flag))
- {
- $first_path = $path;
- unset($first_md5);
- $first_inodev = $stat['dev']."+".$stat['ino'];
- $first_uidgid = $stat['uid']."+".$stat['gid'];
- $first_flag = TRUE;
- echo $path, ' is first file',PHP_EOL;
- }
- else
+ $MD5GRP[$MD5PATH['inodev']][] = $MD5PATH['path'];
+ }
+ foreach ($MD5GRP as $inodev => $pathes)
+ {
+ $INODEV_SIZE[$inodev] = count($pathes);
+ }
+ arsort($INODEV_SIZE); // inode with the most paths comes first
+ $first_inodev = array_key_first($INODEV_SIZE); // that inode is treated as the original
+ $first_inodev_path = array_shift($MD5GRP[$first_inodev]); // its first path is the one reported
+ unset($MD5GRP[$first_inodev]); // the remaining paths of the original inode are not reported as copies
+ echo "original is $first_inodev -> $first_inodev_path",PHP_EOL;
+ foreach ($MD5GRP as $inodev => $paths)
+ {
+ foreach ($paths as $path)
{
- $cmp_inodev = $stat['dev']."+".$stat['ino'];
- $cmp_uidgid = $stat['uid']."+".$stat['gid'];
- if ($first_inodev == $cmp_inodev)
- {
- echo $path, ' is hardlink',PHP_EOL;
- continue;
- }
- if (!isset($first_md5))
- {
- echo 'calculate md5 for first file',PHP_EOL;
- $first_md5 = md5_file($first_path);
- }
- echo 'calculate md5 for next file',PHP_EOL;
- $cmp_md5 = md5_file($path);
- if ($first_md5 == $cmp_md5)
- {
- echo $path, ' is copy',PHP_EOL;
- if ($mode == 0)
- {
-
- }
- }
- else
+ echo " copy is $inodev -> $path";
+ switch ($_actmode)
{
- echo $path, ' is unique',PHP_EOL;
+ case 0:
+ echo " ...do nothing",PHP_EOL;
+ break;
+
+ default: // NOTE(review): modes 1-3 (-d/-l/-s) take no action here and print no line break - looks unfinished, confirm
+ break;
}
}
-
+ }
+ echo PHP_EOL;
+ }
+
+
+// $DB_GetInodevsBySameMd5Uid->execute(array($uidgid,$uidgid));
+// while ($InodevData = $DB_GetInodevsBySameMd5Uid->fetch())
+// { var_dump($InodevData);
+// $md5 = md5_file($InodevData['path']);
+// $DB_AddMd5->execute(array($md5, $InodevData['inodev']));
+// }
+
+// if (!isset($first_flag))
+// {
+// $first_path = $path;
+// unset($first_md5);
+// $first_inodev = $stat['dev']."+".$stat['ino'];
+// $first_uidgid = $stat['uid']."+".$stat['gid'];
+// $first_flag = TRUE;
+// echo $path, ' is first file',PHP_EOL;
+// }
+// else
+// {
+// $cmp_inodev = $stat['dev']."+".$stat['ino'];
+// $cmp_uidgid = $stat['uid']."+".$stat['gid'];
+// if ($first_inodev == $cmp_inodev)
+// {
+// echo $path, ' is hardlink',PHP_EOL;
+// continue;
+// }
+// if (!isset($first_md5))
+// {
+// echo 'calculate md5 for first file',PHP_EOL;
+// $first_md5 = md5_file($first_path);
+// }
+// echo 'calculate md5 for next file',PHP_EOL;
+// $cmp_md5 = md5_file($path);
+// if ($first_md5 == $cmp_md5)
+// {
+// echo $path, ' is copy',PHP_EOL;
+// if ($_actmode == 0)
+// {
+//
+// }
+// }
+// else
+// {
+// echo $path, ' is unique',PHP_EOL;
+//
+// }
+// }
+
// var_dump($path);
// var_dump(stat($path));
- }
- }
- unset($first_flag);
+
}
-