#!/usr/bin/env php
<?php
-define('VERSION', '1.3a');
+define('VERSION', '2.6a');
function get_dir_tree($root)
{
global $DB_AddPath, $counter_dirs, $counter_files, $counter_files_total, $ignore_size;
}
}
}
-echo "reading directory tree... $counter_dirs dirs $counter_files files, ignored files ".$counter_files_total - $counter_files." \r";
+echo "Reading directory tree... $counter_dirs dirs $counter_files files, ignored files ".$counter_files_total - $counter_files." \r";
}
function help_topic($short = FALSE)
{
global $argv;
echo 'File deduplicator ver. ', VERSION, PHP_EOL;
echo 'Usage: ', $argv[0],' [-options] directory',PHP_EOL;
- echo 'Searches for identical files and allows you to remove duplicates', PHP_EOL ,'by deleting them or creating links instead. Symlinks and hardlinks will be ignored.', PHP_EOL;
+ echo 'Searches for identical files and allows you to remove duplicates', PHP_EOL ,'by deleting them or creating links instead. Symlinks and hardlinks',PHP_EOL,'will be ignored.', PHP_EOL;
if ($short == TRUE)
{
echo 'Use -h for help.', PHP_EOL, PHP_EOL;
return;
}
- $opts[] = '-I size';
+ $opts[] = '-I [size]';
$descr[] = 'Ignore files having size smaller then specified size in this parameter. By default size is equal 0 bytes. Size can be specified in kBytes, MBytes and GBytes using suffix k, m, g. Examle -I125 -I10k -I=5G.';
$opts[] = '-n';
$descr[] = 'Dryrun. Search and report but do nothing (default).';
- $opts[] = '-r';
- $descr[] = 'Report to file in csv format, if file not set send to STDIN.';
+ $opts[] = '-r [file]';
+ $descr[] = 'Report to file in csv format, if file not set, then send to STDERR. Example: '.$argv[0].' -rFile.csv target_dir';
$opts[] = '-d';
$descr[] = 'Delete duplicates. Conflict with -n -l -s.';
$opts[] = '-l';
$opts[] = '-h';
$descr[] = 'Show this help topic.';
$opts[] = '-v';
- $descr[] = 'Be verbose while processing.';
+ $descr[] = 'Optional output list of processed files and antions.';
$opts[] = '-V';
$descr[] = 'Show version.';
echo 'options:', PHP_EOL;
foreach ($opts as $key => $val)
{
- preg_match_all('/(.{0,60})(\ |$)/', $descr[$key],$parts);
+ preg_match_all('/(.{0,57})(\ |$)/', $descr[$key], $parts);
foreach ($parts[1] as $part)
{
if ($part == '')
{
continue;
}
- echo "\t", $val, "\t", $part, PHP_EOL;
+ if (strlen($val)> 3)
+ {
+ $_DELIM = "\t";
+ }
+ else
+ {
+ $_DELIM = "\t\t";
+ }
+ echo " ", $val, $_DELIM, $part, PHP_EOL;
$val = '';
}
}
echo PHP_EOL;
}
-function reporter()
+/**
+ * Append one row to the CSV report when a report stream is open.
+ *
+ * The row is written to the global $_CSVREPORT stream with ';' as the
+ * field delimiter. When $_CSVREPORT is not a resource nothing is written.
+ *
+ * @param mixed $group  First column; callers in this file pass the duplicate-group counter.
+ * @param mixed $status Second column; callers in this file pass "O" (original) or "C" (copy).
+ * @param mixed $path   Third column; callers in this file pass the file path.
+ */
+function reporter($group, $status, $path)
+{
+    global $_CSVREPORT;
+    if (!is_resource($_CSVREPORT))
+    {
+        return;
+    }
+    $row = [$group, $status, $path];
+    fputcsv($_CSVREPORT, $row, ';');
+}
+
+/**
+ * Compute a cheap fingerprint of a file from its first and last 40 bytes.
+ *
+ * Despite the name, the digest is crc32b (not md5) over the concatenation
+ * of the two reads.
+ *
+ * @param string $filepath Path of the file to fingerprint.
+ * @return string|false Hex crc32b digest, or false when the file cannot be opened.
+ */
+function shortmd5file ($filepath)
+{
+    $handle = @fopen($filepath, 'r');
+    if ($handle === false)
+    {
+        return false;
+    }
+    $leading = fread($handle, 40);
+    // The result of the seek is intentionally not inspected.
+    fseek($handle, -40, SEEK_END);
+    $trailing = fread($handle, 40);
+    fclose($handle);
+    return hash('crc32b', $leading . $trailing);
+}
+
+/**
+ * Format a byte count using binary units (KiBytes, MiBytes, GiBytes, TiBytes).
+ *
+ * The largest unit whose factor is less than or equal to the size is used and
+ * the value is rounded to two decimals. Sizes below 1024 are printed as
+ * plain bytes. Unit boundaries are inclusive, so exactly 1024 is reported as
+ * "1 KiBytes" rather than "1024 Bytes".
+ *
+ * @param int|string $size Size in bytes; cast to int before formatting.
+ * @return string Human-readable size such as "1.5 KiBytes" or "12 Bytes".
+ */
+function size_humanreadable($size)
+{
+    $size = (int) $size;
+    // Ordered from largest to smallest so the first match is the best unit.
+    $units = [
+        [1099511627776, ' TiBytes'],
+        [1073741824, ' GiBytes'],
+        [1048576, ' MiBytes'],
+        [1024, ' KiBytes'],
+    ];
+    foreach ($units as $unit)
+    {
+        if ($size >= $unit[0])
+        {
+            return round($size / $unit[0], 2) . $unit[1];
+        }
+    }
+    return $size . ' Bytes';
+}
+
+function verbose ($str) // Echo $str only when the -v option is present in the global $opts.
{
global $opts;
- if (!isset($opts['r']))
+ if (isset($opts['v'])) // $opts is assigned from getopt() at script start-up.
{
- return;
+ echo $str;
}
-
}
+
// initialization
$counter_dirs = 0;
$counter_files = 0;
$counter_files_total = 0;
$tmpfile = '/dev/shm/dedupler.tmp';
$optind = null;
-$opts = getopt('nrdlsihvVI::');
+$opts = getopt('ndlsihvVI::r::');
$dir = false;
$ignore_size = false;
//var_dump($opts);exit;
-
-function shortmd5file ($filepath)
-{
- $fh = @fopen($filepath, 'r');
- if(!$fh)
- {
- return false;
- }
- $head = fread($fh, 512);
- $seekstat = fseek($fh, -512, SEEK_END);
- $tail = fread($fh, 512);
- fclose($fh);
- return md5($head.$tail);
-}
+echo PHP_EOL;
// check that the sqlite driver is available for PDO
if (array_search('sqlite',PDO::getAvailableDrivers()) === false)
exit();
}
+if (isset($opts['r']))
+{
+ if ($opts['r'] === false || is_array($opts['r']))
+ {
+ $repfile = "php://stderr";
+ }
+ else
+ {
+ $repfile = $opts['r'];
+ }
+ $_CSVREPORT = fopen($repfile, 'w');
+ if ($_CSVREPORT === false)
+ {
+ echo $repfile,'can not be opened! CSV report disabled...', PHP_EOL;
+ unset($_CSVREPORT);
+ }
+}
+
if (isset($opts['I']))
{
// if (!is_array($opts['I']))
switch ($sizesuffix)
{
case 'k':
- $ignore_size = $sizeprefix * 1000;
+ $ignore_size = $sizeprefix * 1024;
break;
case 'm':
- $ignore_size = $sizeprefix * 1000000;
+ $ignore_size = $sizeprefix * 1048576;
break;
case 'g':
- $ignore_size = $sizeprefix * 1000000000;
+ $ignore_size = $sizeprefix * 1073741824;
break;
default:
}
}
-if ($ignore_size !== false)
-{
- echo "Warning! Files having size smaller then {$ignore_size} bytes will be ignored.", PHP_EOL;
-}
//var_dump($ignore_size);exit;
{
echo "Ignoring UID GID and PERMS for compare files",PHP_EOL;
}
+
+if ($ignore_size !== false)
+{
+ echo "Warning! Files having size smaller then ", size_humanreadable($ignore_size)," will be ignored.", PHP_EOL;
+}
+
$_actmode = 0;
if (isset($opts['d']))
)
');
$DBH->exec ('CREATE INDEX filesize ON fstree (size)');
+$DBH->exec ('CREATE INDEX inodev1 ON fstree (inodev)');
$DBH->exec ('CREATE INDEX inodev2 ON inodev (inodev)');
$DBH->exec ('CREATE INDEX uidgid ON inodev (uidgid)');
$DBH->exec ('CREATE INDEX tailhead ON inodev (tailhead)');
// prepare the SQL statements
$DB_AddPath = $DBH->prepare("INSERT INTO fstree (path, size) values (?, ?)");
-$DB_GetBySize = $DBH->prepare("select id, path from fstree where size = ?");
-$DB_GetSizeByGroup = $DBH->prepare('select size from fstree GROUP BY `size`');
$DB_PurgeUnique = $DBH->prepare('delete from fstree where id IN (select id from fstree GROUP BY `size` HAVING count(*)=1)');
$DB_AddInodev = $DBH->prepare('update fstree set inodev = ? where id = ?');
$DB_AddMeta = $DBH->prepare('insert or ignore into inodev (inodev, uidgid) values (?, ?)');
$DB_GetAllPathCount = $DBH->prepare('select count(*) from fstree');
$DB_GetAllUidgid = $DBH->prepare('select uidgid from inodev group by uidgid');
$DB_GetInodevsByUid = $DBH->prepare('select inodev.id, inodev, path from inodev join fstree using (inodev) where uidgid = ? group by inodev');
+$DB_GetInodevsByUidCount = $DBH->prepare('select count(*) from (select inodev.id, inodev, path from inodev join fstree using (inodev) where uidgid = ? group by inodev)');
$DB_GetInodevsBySameTailheadUid = $DBH->prepare('select inodev.id, inodev, path from inodev join fstree using (inodev) where uidgid = ? and tailhead in (select tailhead from inodev where uidgid = ? and tailhead not null group by tailhead HAVING count(*)>1) group by inodev');
-$DB_GetInodevsBySameMd5Uid = $DBH->prepare('select inodev.id, inodev, path from inodev join fstree using (inodev) where uidgid = ? and md5 in (select md5 from inodev where uidgid = ? and md5 not null group by md5 HAVING count(*)>1) group by inodev');
+$DB_GetInodevsBySameTailheadUidCount = $DBH->prepare('select count(*) from (select inodev.id, inodev, path from inodev join fstree using (inodev) where uidgid = ? and tailhead in (select tailhead from inodev where uidgid = ? and tailhead not null group by tailhead HAVING count(*)>1) group by inodev)');
$DB_GetAllMd5Dup = $DBH->prepare('select md5 from inodev where uidgid = ? and md5 not null group by md5 HAVING count(*)>1;');
$DB_GetPathByMd5 = $DBH->prepare('select inodev, path from inodev join fstree using (inodev) where md5 = ?');
+$DB_GetSizeByInodev = $DBH->prepare('select size from fstree where inodev = ? limit 1');
// read the directory tree into the database and purge files that are known to be unique
$tree = get_dir_tree($dir);
$DBH->commit();
echo PHP_EOL;
-echo 'purge known unique files from tree...', PHP_EOL;
+echo 'Purge known unique files from tree...', PHP_EOL;
$DB_PurgeUnique->execute();
// analyze the remaining files, each of which shares its size with at least one other file
// fetch each file's metadata and store it in the database
$cnt_files++;
$path = $idpath['path'];
- echo $total_files," / ", $cnt_files, "\r";
+ echo "Total/Current: ",$total_files," / ", $cnt_files, "\r";
$stat = stat($path);
$inodev = $stat['dev']."+".$stat['ino'];
if (isset($opts['i']))
echo PHP_EOL;
// analyze groups of files that share the same uidgid
-echo 'analyzing groups of files having similar UID and GID...', PHP_EOL;
+echo 'Analyzing groups of files having similar UID and GID...', PHP_EOL;
$DB_GetAllUidgid->execute();
while ($uidgid = $DB_GetAllUidgid->fetch())
{
$uidgid = implode($uidgid);
- echo PHP_EOL, "now analyzing group of files having UID GID $uidgid", PHP_EOL;
+ echo PHP_EOL, "Now analyzing group of files having UID GID $uidgid", PHP_EOL;
// compute and store tailhead for all of this user's files that are duplicated by size
+ echo 'Reading head and tail of the files...',PHP_EOL;
+ $DB_GetInodevsByUidCount->execute(array($uidgid));
+ $total_files = $DB_GetInodevsByUidCount->fetchColumn();
+ $cnt_files = 0;
$DB_GetInodevsByUid->execute(array($uidgid));
$DBH->beginTransaction();
- $cnt_files = 0;
while ($InodevData = $DB_GetInodevsByUid->fetch())
{
$cnt_files++;
- echo $cnt_files,"\r";
+ echo "Total/Current: ",$total_files," / ", $cnt_files, "\r";
$shortmd5 = shortmd5file($InodevData['path']);
$DB_AddTailhead->execute(array($shortmd5, $InodevData['inodev']));
}
$DBH->commit();
+ echo PHP_EOL;
// compute and store md5 for all of this user's files that are duplicated by tailhead
+ echo 'Reading full content of the files having the same tail and head...',PHP_EOL;
+ $DB_GetInodevsBySameTailheadUidCount->execute(array($uidgid,$uidgid));
+ $total_files = $DB_GetInodevsBySameTailheadUidCount->fetchColumn();
+ $cnt_files = 0;
$DB_GetInodevsBySameTailheadUid->execute(array($uidgid,$uidgid));
$DBH->beginTransaction();
- $cnt_files = 0;
while ($InodevData = $DB_GetInodevsBySameTailheadUid->fetch())
{
$cnt_files++;
- echo $cnt_files,"\r";
- $md5 = md5_file($InodevData['path']);
+ echo "Total/Current: ",$total_files," / ", $cnt_files, "\r";
+ $md5 = hash_file('md5',$InodevData['path']);
$DB_AddMd5->execute(array($md5, $InodevData['inodev']));
}
$DBH->commit();
+ echo PHP_EOL,PHP_EOL;
// process the copies identified by md5 hashing
$DB_GetAllMd5Dup->execute(array($uidgid));
$cnt_files = 0;
+ $DUP_TOTALSIZE = 0;
+ $DUP_FSIZE = 0;
while ($md5 = $DB_GetAllMd5Dup->fetch())
{
$cnt_files++;
- echo $cnt_files,"\r";
+ verbose("Group ".$cnt_files.PHP_EOL);
$md5 = implode($md5);
$DB_GetPathByMd5->execute(array($md5));
$MD5GRP = array();
$first_inodev = array_key_first($INODEV_SIZE);
$first_inodev_path = array_shift($MD5GRP[$first_inodev]);
unset($MD5GRP[$first_inodev]);
- echo "original is $first_inodev -> $first_inodev_path",PHP_EOL;
+ verbose ("Original is $first_inodev_path".PHP_EOL);
+ reporter($cnt_files, "O", $first_inodev_path);
+ $DB_GetSizeByInodev->execute(array($first_inodev));
+ $DUP_FSIZE = $DB_GetSizeByInodev->fetchColumn();
foreach ($MD5GRP as $inodev => $paths)
{
foreach ($paths as $path)
{
- echo " copy is $inodev -> $path";
+ $DUP_TOTALSIZE = $DUP_TOTALSIZE + $DUP_FSIZE;
+ verbose (" Copy is $path");
+ reporter($cnt_files, "C", $path);
+
switch ($_actmode)
{
case 0:
- echo " ...do nothing",PHP_EOL;
-
+ verbose(" ...do nothing".PHP_EOL);
break;
-
- default:
+ case 1:
+ if (@unlink($path))
+ {
+ verbose (" ...deleted".PHP_EOL);
+ }
+ else
+ {
+ verbose (" ...delete failed!".PHP_EOL);
+ }
break;
+
+ case 2:
+ if (@rename($path, $path.".tmpfb"))
+ {
+ if(@link($first_inodev_path, $path))
+ {
+ if (@unlink($path.".tmpfb"))
+ {
+ verbose(" ...hardlinked".PHP_EOL);
+ }
+ else
+ {
+ @rename($path.".tmpfb", $path);
+ verbose (" ...hardlinking failed!".PHP_EOL);
+ }
+ }
+ else
+ {
+ @rename($path.".tmpfb", $path);
+ verbose (" ...hardlinking failed!".PHP_EOL);
+ }
+ }
+ else
+ {
+ verbose (" ...hardlinking failed!".PHP_EOL);
+ }
+ break;
+
+ case 3:
+ if (@rename($path, $path.".tmpfb"))
+ {
+ if(@symlink($first_inodev_path, $path))
+ {
+ if (@unlink($path.".tmpfb"))
+ {
+ verbose(" ...symlinked".PHP_EOL);
+ }
+ else
+ {
+ @rename($path.".tmpfb", $path);
+ verbose(" ...symlinking failed!".PHP_EOL);
+ }
+ }
+ else
+ {
+ @rename($path.".tmpfb", $path);
+ verbose(" ...symlinking failed!".PHP_EOL);
+ }
+ }
+ else
+ {
+ verbose(" ...symlinking failed!".PHP_EOL);
+ }
+ break;
}
}
}
- echo PHP_EOL;
+ verbose(PHP_EOL);
}
-
-
-// $DB_GetInodevsBySameMd5Uid->execute(array($uidgid,$uidgid));
-// while ($InodevData = $DB_GetInodevsBySameMd5Uid->fetch())
-// { var_dump($InodevData);
-// $md5 = md5_file($InodevData['path']);
-// $DB_AddMd5->execute(array($md5, $InodevData['inodev']));
-// }
-
-// if (!isset($first_flag))
-// {
-// $first_path = $path;
-// unset($first_md5);
-// $first_inodev = $stat['dev']."+".$stat['ino'];
-// $first_uidgid = $stat['uid']."+".$stat['gid'];
-// $first_flag = TRUE;
-// echo $path, ' is first file',PHP_EOL;
-// }
-// else
-// {
-// $cmp_inodev = $stat['dev']."+".$stat['ino'];
-// $cmp_uidgid = $stat['uid']."+".$stat['gid'];
-// if ($first_inodev == $cmp_inodev)
-// {
-// echo $path, ' is hardlink',PHP_EOL;
-// continue;
-// }
-// if (!isset($first_md5))
-// {
-// echo 'calculate md5 for first file',PHP_EOL;
-// $first_md5 = md5_file($first_path);
-// }
-// echo 'calculate md5 for next file',PHP_EOL;
-// $cmp_md5 = md5_file($path);
-// if ($first_md5 == $cmp_md5)
-// {
-// echo $path, ' is copy',PHP_EOL;
-// if ($_actmode == 0)
-// {
-//
-// }
-// }
-// else
-// {
-// echo $path, ' is unique',PHP_EOL;
-//
-// }
-// }
-
-// var_dump($path);
-// var_dump(stat($path));
-
-
-
+
}
+echo size_humanreadable($DUP_TOTALSIZE),($_actmode == 0) ? ' can be' : ''," deduplicated.",PHP_EOL;