00001
00002
00003
00004 import os, os.path, sys, string, shutil, tempfile, fileUtils, time
00005
00006
# Usage text of the script: printed for -h/--help and after any command-line
# error reported by the main section.
__doc__ = """
Usage: tree-file-compare.py [ -h | --help ] [ -v | --verbose ] [ -r | --reverse ] [ --by-name-only ] --reference A_PATH [ --mirror ANOTHER_PATH ]

Will scan the first specified tree (the reference), in search of duplicated files (same content, different path). The resulting associations will be stored in ~/*-tree-file-compare.log files. If a second tree is specified (--mirror option), then will also look for files whose content is in the second tree but not in the first one, to ensure the reference tree is complete.

This script is useful to ensure a reference tree does not lack any content from a mirror and to know whether the mirror is up-to-date.
The script can be used for example for snapshots or archives.

Options:
  -h or --help: display this help and exit
  -v or --verbose: set verbose mode
  --reference A_PATH: specifies the reference tree to scan (mandatory)
  --mirror A_PATH: specifies a second tree to compare with
  --by-name-only: comparison is done based on names only; no MD5 checksum performed (useful when the names refer clearly to the content, as an archive filename, as opposed to snapshots)
  -r or --reverse: reverse-compare, i.e. search for files that are common to both trees rather than lacking in one (useful to ensure there is no duplicate between trees); requires --mirror
"""
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039
00040
00041
# File object of the report. It stays None until the main section opens
# log_filename; output() appends every message it prints to it.
log_file = None

# Directory receiving the report: the home directory of the current user.
base_write_path = os.path.expanduser("~")

# Report name, prefixed by the current UTC date formatted as YYYYMMDD.
file_base_name = time.strftime(
    "%Y%m%d-tree-file-compare.log",
    time.gmtime(),
)

# Full path of the report file.
log_filename = os.path.join(base_write_path, file_base_name)
00050
00051
00052
def output(message):
    """Print `message` on the standard output and append it to the report.

    The report is the module-level `log_file` object; the message is
    written there followed by a newline. While `log_file` is still None
    (report not opened yet), the message is only printed.
    """
    # Single-argument call form: behaves the same with the Python 2 print
    # statement and with the Python 3 print function.
    print(message)
    if log_file is not None:
        # One-element tuple so that a tuple message is formatted as a whole.
        log_file.write("%s\n" % (message,))
00056
00057
00058
def build_file_index_for(path):
    """Build the content index and the name index of the tree rooted at `path`.

    The files are listed with fileUtils.getAllRelativeFilePathsFromRoot; each
    listed entry is joined to `path` to compute its MD5 code with
    fileUtils.getMD5codeFor.

    Returns a pair (content_dict, name_dict) where:
      - content_dict maps each MD5 code to the list of listed paths having it
      - name_dict maps each base filename to the list of listed paths
        bearing that name

    A key associated with more than one path therefore denotes duplicates.
    """
    content_dict = {}
    name_dict = {}

    for f in fileUtils.getAllRelativeFilePathsFromRoot(path):

        # The checksum must be computed on the actual file location.
        md5 = fileUtils.getMD5codeFor(os.path.join(path, f))

        # setdefault creates the list on first sight of a key, which keeps
        # both indexes plain dictionaries.
        content_dict.setdefault(md5, []).append(f)
        name_dict.setdefault(os.path.basename(f), []).append(f)

    return (content_dict, name_dict)
00094
00095
00096
def build_name_index_for(path):
    """Build the name index of the tree rooted at `path`.

    The files are listed with fileUtils.getAllRelativeFilePathsFromRoot; no
    file content is read, hence no checksum is computed.

    Returns a dictionary mapping each base filename to the list of listed
    paths bearing that name.
    """
    name_dict = {}

    for f in fileUtils.getAllRelativeFilePathsFromRoot(path):
        # setdefault creates the list on first sight of a name.
        name_dict.setdefault(os.path.basename(f), []).append(f)

    return name_dict
00121
00122
00123
def display_content_duplicates(root_path,content_index):
    """Reports, through output(), each group of files sharing the same content.

    root_path is only used in the heading; content_index maps a content key
    to the list of file paths having that content. Only the lists holding
    more than one path are reported. A blank line closes the section.
    """
    output( "Displaying duplicated content in tree %s:" % (root_path,))
    duplicate_groups = [ paths for paths in content_index.values()
        if len(paths) > 1 ]
    for paths in duplicate_groups:
        output( " + identical content: %s." % (paths,) )
    output("")
00133
00134
00135
def display_name_duplicates(root_path,name_index):
    """Reports, through output(), each group of files sharing the same name.

    root_path is only used in the heading; name_index maps a filename to the
    list of file paths bearing it. Only the lists holding more than one path
    are reported. A blank line closes the section.
    """
    output( "Displaying duplicated names in tree %s:" % (root_path,))
    duplicate_groups = [ paths for paths in name_index.values()
        if len(paths) > 1 ]
    for paths in duplicate_groups:
        output( " + duplicated names: %s." % (paths,) )
    output("")
00145
00146
00147
def compare_content_trees(ref_content_index,mirror_content_index):
    """Compares the reference and mirror trees, based on the file content.

    Both parameters map a content key to a list of file paths. For each
    content of the reference:
      - if the mirror has it too but under a different list of paths, both
        lists are reported (identical lists are not reported)
      - if the mirror lacks it, the reference paths are reported

    Useful to know whether a mirror is complete. Everything is reported
    through output(), and a blank line closes the section.
    """
    output("Comparing reference tree with mirror tree:")
    for content_key, ref_files in ref_content_index.items():
        if content_key in mirror_content_index:
            mirror_files = mirror_content_index[content_key]
            # Same content under the same paths is the nominal case: silent.
            if mirror_files != ref_files:
                output( " + identical content for %s in reference and %s in mirror." % (ref_files,mirror_files) )
        else:
            output( " (content corresponding to %s is in reference but not in mirror)" % (ref_files,) )
    output("")
00162
00163
def compare_name_trees(ref_name_index,mirror_name_index):
    """Compares the reference and mirror trees, based on the file names.

    Both parameters map a filename to a list of file paths. For each name of
    the reference:
      - if the mirror has it too but under a different list of paths, both
        lists are reported (identical lists are not reported)
      - if the mirror lacks it, the reference paths are reported

    Useful to know whether a mirror is complete. Everything is reported
    through output(), and a blank line closes the section.
    """
    output("Comparing reference tree with mirror tree:")
    for name, ref_files in ref_name_index.items():
        if name in mirror_name_index:
            mirror_files = mirror_name_index[name]
            # Same name under the same paths is the nominal case: silent.
            if mirror_files != ref_files:
                output( " + identical name for %s in reference and %s in mirror." % (ref_files,mirror_files) )
        else:
            output( " (name corresponding to %s is in reference but not in mirror)" % (ref_files,) )
    output("")
00178
00179
00180
def check_content_completeness(ref_content_index,mirror_content_index):
    """Checks that all the content of the mirror tree is in the reference tree.

    Both parameters map a content key to a list of file paths. The paths of
    each content found in the mirror index but not in the reference index
    are reported through output(); only the keys are compared, not the
    paths. A blank line closes the section.
    """
    output("Checking completeness of reference regarding the mirror:")
    for content_key, mirror_files in mirror_content_index.items():
        if content_key not in ref_content_index:
            output( " + content corresponding to %s is in mirror but not in reference." % (mirror_files,) )
    output("")
00189
00190
00191
def check_mirror_completeness(ref_content_index,mirror_content_index):
    """Checks that all the content of the reference tree is in the mirror tree.

    Both parameters map a content key to a list of file paths. The paths of
    each content found in the reference index but not in the mirror index
    are reported through output(); only the keys are compared, not the
    paths. A blank line closes the section.
    """
    output("Checking completeness of mirror regarding the reference:")
    for content_key, ref_files in ref_content_index.items():
        if content_key not in mirror_content_index:
            output( " + content corresponding to %s is in reference but not in mirror." % (ref_files,) )
    output("")
00200
00201
00202
def check_name_completeness(ref_name_index,mirror_name_index):
    """Checks that all the filenames of the mirror tree are in the reference tree.

    Both parameters map a filename to a list of file paths. The paths of each
    name found in the mirror index but not in the reference index are
    reported through output(); only the names are compared, not the paths.
    A blank line closes the section.
    """
    output("Checking completeness of reference regarding the mirror:")
    for name, mirror_files in mirror_name_index.items():
        if name not in ref_name_index:
            output( " + name corresponding to %s is in mirror but not in reference." % (mirror_files,) )
    output("")
00211
00212
00213
def detect_common_content(ref_content_index,mirror_content_index):
    """Reports the content that is present in both the reference and the mirror.

    Useful when the two trees are expected to partition a set of files, i.e.
    when the same content should not appear in both of them. Common files
    are detected in terms of content.

    Both parameters map a content key to a list of file paths. For each key
    found in both indexes, the mirror paths and the reference paths are
    reported through output(). A blank line closes the section.
    """
    output("Looking for duplicated content between reference and mirror:")
    for content_key, mirror_files in mirror_content_index.items():
        if content_key in ref_content_index:
            output( " + content corresponding to %s in mirror is also in reference, as %s." % (mirror_files,ref_content_index[content_key]) )
    output("")
00222
00223
00224
00225
00226
00227
00228
00229
00230
def detect_common_name(ref_name_index,mirror_name_index):
    """Reports the filenames that are present in both the reference and the mirror.

    Useful when the two trees are expected to partition a set of files, i.e.
    when the same name should not appear in both of them. Common files are
    detected in terms of name.

    Both parameters map a filename to a list of file paths. For each name
    found in both indexes, the name, the mirror paths and the reference
    paths are reported through output(). A blank line closes the section.
    """
    output("Looking for duplicated names between reference and mirror:")
    for name, mirror_files in mirror_name_index.items():
        if name in ref_name_index:
            output( " + name %s is in mirror, as %s, and in reference, as %s." % (name, mirror_files,ref_name_index[name]) )
    output("")
00239
00240
00241
00242
00243
00244
00245
00246
00247
def write_hashes(log_file,content_index):
    """Dumps the specified content index into the specified log file.

    log_file is any object offering a write() method. After a 'Hashes:'
    heading, one line is written per entry of content_index, made of the key
    followed by its associated value; a blank line closes the section.
    """
    log_file.write("Hashes:\n\n")
    for digest, paths in content_index.items():
        entry = " %s %s\n" % (digest,paths)
        log_file.write(entry)
    log_file.write("\n")
00254
00255
00256
00257
if __name__ == '__main__':

    # Script entry point: parses the command line, scans the requested
    # tree(s) and writes the report both on the console and in log_filename.
    #
    # Exit codes: 0 after displaying the help, 1 for a malformed command
    # line, 2 when no reference path is given, 3 when a reverse comparison
    # is requested without a mirror path.

    help_options = [ '-h', '--help' ]
    verbose_options = [ '-v', '--verbose' ]
    by_name_options = [ '--by-name-only' ]
    reverse_options = [ '-r', '--reverse' ]
    path_options = [ '--reference', '--mirror' ]

    # Defaults:
    verbose = False
    compare_by_content = True
    reverse_compare = False

    reference_path = None
    mirror_path = None

    # The parsing consumes a copy, so that the arguments recorded in the
    # report (and sys.argv itself) are left untouched.
    saved_args = sys.argv[1:]
    pending_args = list(saved_args)

    while pending_args:

        item = pending_args.pop(0)

        if item in help_options:
            print(__doc__)
            sys.exit( 0 )

        elif item in path_options:
            # These options need a value: report its absence instead of
            # failing on an empty list.
            if not pending_args:
                print("Error, option %s expects a path, stopping." % ( item, ))
                print(__doc__)
                sys.exit( 1 )
            if item == "--reference":
                reference_path = pending_args.pop(0)
            else:
                mirror_path = pending_args.pop(0)

        elif item in verbose_options:
            verbose = True
            print("Verbose mode activated.")

        elif item in by_name_options:
            compare_by_content = False
            print("Comparison will be based on names only, rather than on content too.")

        elif item in reverse_options:
            reverse_compare = True
            print("Reverse comparison will be performed: looking for duplicates rather than lacking files.")

        else:
            print("Error, unexpected parameter: %s, stopping." % ( item, ))
            print(__doc__)
            sys.exit( 1 )

    if verbose:
        print("Reference path = %s" % ( reference_path, ))
        print("Mirror path = %s" % ( mirror_path, ))

    if not reference_path:
        print("Error, no reference path given, stopping.")
        print(__doc__)
        sys.exit( 2 )

    # A reverse comparison looks for what both trees have in common, so it
    # is meaningless without a second tree.
    if reverse_compare and not mirror_path:
        print("Error, reverse comparison requires a mirror path, stopping.")
        print(__doc__)
        sys.exit( 3 )

    # Assigned at module level: output() relies on this log_file.
    log_file = open(log_filename,"w")

    # try/finally ensures the report is closed even if a scan fails.
    try:

        log_file.write( "Report generated on %s.\n" % ( time.strftime("%a, %d %B %Y %H:%M:%S", time.gmtime()),) )

        log_file.write( "Arguments specified: %s" % (saved_args,) )

        print("Scanning reference tree...")

        if reverse_compare:

            # Content indexes (hence MD5 checksums) are only built when the
            # comparison is not restricted to names.
            if compare_by_content:
                (ref_content_index,ref_name_index) = build_file_index_for( reference_path )
                print("Scanning mirror tree...")
                (mirror_content_index,mirror_name_index) = build_file_index_for( mirror_path )
            else:
                ref_name_index = build_name_index_for( reference_path )
                print("Scanning mirror tree...")
                mirror_name_index = build_name_index_for( mirror_path )

            log_file.write("\n\n ***** For reference tree %s *****\n\n" % (reference_path,))
            if compare_by_content:
                display_content_duplicates(reference_path,ref_content_index)
            display_name_duplicates(reference_path,ref_name_index)

            log_file.write("\n\n ***** For mirror tree %s *****\n\n" % (mirror_path,))
            if compare_by_content:
                display_content_duplicates(mirror_path,mirror_content_index)
            display_name_duplicates(mirror_path,mirror_name_index)

            # The section header is written before the results it introduces.
            log_file.write("\n\n ***** Tree comparison *****\n\n")
            if compare_by_content:
                detect_common_content( ref_content_index, mirror_content_index )
            detect_common_name( ref_name_index, mirror_name_index )

            if compare_by_content:
                write_hashes(log_file,ref_content_index)
                write_hashes(log_file,mirror_content_index)

        else:

            if compare_by_content:
                (ref_content_index,ref_name_index) = build_file_index_for( reference_path )
            else:
                ref_name_index = build_name_index_for( reference_path )

            log_file.write("\n\n ***** For reference tree %s *****\n\n" % (reference_path,))
            if compare_by_content:
                display_content_duplicates(reference_path,ref_content_index)
            display_name_duplicates(reference_path,ref_name_index)
            if compare_by_content:
                write_hashes(log_file,ref_content_index)

            if mirror_path:

                log_file.write("\n\n ***** For mirror tree %s *****\n\n" % (mirror_path,))
                print("Scanning mirror tree...")

                if compare_by_content:
                    (mirror_content_index,mirror_name_index) = build_file_index_for( mirror_path )
                    display_content_duplicates(mirror_path,mirror_content_index)
                    display_name_duplicates(mirror_path,mirror_name_index)
                    write_hashes(log_file,mirror_content_index)
                else:
                    mirror_name_index = build_name_index_for( mirror_path )
                    display_name_duplicates(mirror_path,mirror_name_index)

                # The section header is written before the results it introduces.
                log_file.write("\n\n ***** Tree comparison *****\n\n")

                if compare_by_content:
                    compare_content_trees( ref_content_index,
                        mirror_content_index )
                    check_content_completeness( ref_content_index,
                        mirror_content_index )
                else:
                    # NOTE(review): compare_name_trees is deliberately not
                    # called here, as in the original flow - confirm.
                    check_name_completeness( ref_name_index,
                        mirror_name_index )

    finally:
        log_file.close()
00400