add a dotfile check and only delete hidden files (dotfiles) during a destructive search.
[gentoolkit.git] pym/gentoolkit/eclean/search.py
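A minimal sketch of the behaviour this change adds, not part of the file below; it assumes a Gentoo host where portage and gentoolkit are importable, and the hidden file name is hypothetical. The new check follows the (should_break, is_dirty) convention of the other checks: hidden files are flagged as keepers, and the check is registered only for non-destructive searches, so dot files are removed only by a destructive search.

        import os
        from gentoolkit.eclean.search import DistfilesSearch

        st = os.lstat("/usr/portage/distfiles/.keep")  # hypothetical hidden file
        # dot file -> (True, False): stop checking, keep the file
        print(DistfilesSearch._dotfile_check_(st, ".keep"))
        # normal distfile -> (False, True): keep checking, still a cleaning candidate
        print(DistfilesSearch._dotfile_check_(st, "foo-1.0.tar.gz"))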
#!/usr/bin/python

# Copyright 2003-2010 Gentoo Foundation
# Distributed under the terms of the GNU General Public License v2


from __future__ import print_function


import os
import re
import stat
import sys
from functools import partial

import portage

import gentoolkit
import gentoolkit.pprinter as pp
from gentoolkit.eclean.exclude import (exclDictMatchCP, exclDictExpand,
        exclDictExpandPkgname, exclMatchFilename)


# Misc. shortcuts to some portage stuff:
port_settings = portage.settings
pkgdir = port_settings["PKGDIR"]

err = sys.stderr
deprecated_message = """Deprecation Warning: Installed package: %s
        Is no longer in the tree or an installed overlay"""
DEPRECATED = pp.warn(deprecated_message)

debug_modules = []


def dprint(module, message):
        if module in debug_modules:
                print(message)


def get_distdir():
        """Returns DISTDIR if sane, else prints an error and exits."""

        d = portage.settings["DISTDIR"]
        if not os.path.isdir(d):
                e = pp.error("%s does not appear to be a directory.\n" % d)
                e += pp.error("Please set DISTDIR to a sane value.\n")
                e += pp.error("(Check your /etc/make.conf and environment).")
                print(e, file=sys.stderr)
                sys.exit(1)
        return d

distdir = get_distdir()


class DistfilesSearch(object):
        """Search DISTDIR for obsolete distfiles to clean.

                @param output: verbose output method or (lambda x: None) to turn off
                @param vardb: defaults to portage.db[portage.root]["vartree"].dbapi
                                        is overridden for testing.
                @param portdb: defaults to portage.portdb and is overridden for testing.
        """

        def __init__(self,
                        output,
                        portdb=portage.portdb,
                        vardb=portage.db[portage.root]["vartree"].dbapi,
                        ):
                self.vardb = vardb
                self.portdb = portdb
                self.output = output
                self.installed_cpvs = None

        def findDistfiles(self,
                        exclude=None,
                        destructive=False,
                        fetch_restricted=False,
                        package_names=False,
                        time_limit=0,
                        size_limit=0,
                        _distdir=distdir,
                        deprecate=False,
                        extra_checks=()
                        ):
                """Find all obsolete distfiles.

                XXX: what about cvs ebuilds?
                I should install some to see where it goes...

                @param exclude: an exclusion dict as defined in
                                exclude.parseExcludeFile class.
                @param destructive: boolean, defaults to False
                @param fetch_restricted: boolean, defaults to False
                @param package_names: boolean, defaults to False.
                @param time_limit: integer time value as returned by parseTime()
                @param size_limit: integer value of max. file size to keep or 0 to ignore.
                @param _distdir: path to the distfiles dir being checked,
                                defaults to portage's DISTDIR.
                @param deprecate: bool; when False a final exclusion check is run
                                on the remaining cleaning candidates
                @param extra_checks: tuple of additional check callables to run

                @rtype: tuple of dicts
                @return (clean_me, saved, deprecated); clean_me maps
                                distfile names to [filepath,]
                """
                if exclude is None:
                        exclude = {}
                clean_me = {}
                pkgs = {}
                saved = {}
                deprecated = {}
                installed_included = False
                # create a big CPV->SRC_URI dict of packages
                # whose distfiles should be kept
                if (not destructive) or fetch_restricted:
                        self.output("...non-destructive type search")
                        pkgs, _deprecated = self._non_destructive(destructive, fetch_restricted)
                        deprecated.update(_deprecated)
                        installed_included = True
                if destructive:
                        self.output("...destructive type search: %d packages already found" % len(pkgs))
                        pkgs, _deprecated = self._destructive(package_names,
                                        exclude, pkgs, installed_included)
                        deprecated.update(_deprecated)
                # gather the files to be cleaned
                self.output("...checking limits for %d ebuild sources"
                                % len(pkgs))

                checks = self._get_default_checks(size_limit, time_limit, exclude, destructive)
                checks.extend(extra_checks)
                clean_me = self._check_limits(_distdir, checks, clean_me)
                # remove any protected files from the list
                self.output("...removing protected sources from %d candidates to clean"
                                % len(clean_me))
                clean_me = self._remove_protected(pkgs, clean_me)
                if not deprecate and len(exclude) and len(clean_me):
                        self.output("...final exclusion check on %d remaining candidates to clean"
                                % len(clean_me))
                        clean_me, saved = self._check_excludes(exclude, clean_me)
                return clean_me, saved, deprecated


####################### begin _check_limits code block
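# Each check callable used below takes (file_stat, filename) and returns a
# (should_break, is_dirty) pair: should_break stops further checks for the
# current file, is_dirty marks the file as a candidate for cleaning.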

        def _get_default_checks(self, size_limit, time_limit, excludes, destructive):
                #checks =[(self._isreg_check_, "is_reg_check")]
                checks = [self._isreg_check_]
                if 'filenames' in excludes:
                        #checks.append((partial(self._filenames_check_, excludes), "Filenames_check"))
                        checks.append(partial(self._filenames_check_, excludes))
                else:
                        self.output("   - skipping exclude filenames check")
                if size_limit:
                        #checks.append((partial(self._size_check_, size_limit), "size_check"))
                        checks.append(partial(self._size_check_, size_limit))
                else:
                        self.output("   - skipping size limit check")
                if time_limit:
                        #print("time_limit = ", time_limit/1000000,"M sec")
                        #checks.append((partial(self._time_check_, time_limit), "time_check"))
                        checks.append(partial(self._time_check_, time_limit))
                else:
                        self.output("   - skipping time limit check")
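                # hidden dot files are protected unless the search is destructive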
                if destructive:
                        self.output("   - skipping dot files check")
                else:
                        checks.append(self._dotfile_check_)
                return checks


        def _check_limits(self,
                        _distdir,
                        checks,
                        clean_me=None
                        ):
                """Checks whether files exceed the size and/or time limits, etc.

                To start with everything is considered dirty and is excluded
                only if it matches some condition.
                """
                if clean_me is None:
                        clean_me = {}
                for file in os.listdir(_distdir):
                        filepath = os.path.join(_distdir, file)
                        try:
                                file_stat = os.lstat(filepath)
                        except EnvironmentError:
                                continue
                        is_dirty = False
                        #for check, check_name in checks:
                        for check in checks:
                                should_break, is_dirty = check(file_stat, file)
                                if should_break:
                                        break

                        if is_dirty:
                                #print( "%s Adding file to clean_list:" %check_name, file)
                                clean_me[file] = [filepath]
                return clean_me

        @staticmethod
        def _isreg_check_(file_stat, file):
                """check if file is a regular file."""
                is_reg_file = stat.S_ISREG(file_stat[stat.ST_MODE])
                return not is_reg_file, is_reg_file

        @staticmethod
        def _size_check_(size_limit, file_stat, file):
                """checks if the file size exceeds the size_limit"""
                if (file_stat[stat.ST_SIZE] >= size_limit):
                        #print( "size mismatch ", file, file_stat[stat.ST_SIZE])
                        return True, False
                return False, True

        @staticmethod
        def _time_check_(time_limit, file_stat, file):
                """checks if the file exceeds the time_limit
                (think forward, not back, time keeps increasing)"""
                if (file_stat[stat.ST_MTIME] >= time_limit):
                        #print( "time match too young ", file, file_stat[stat.ST_MTIME]/1000000,"M sec.")
                        return True, False
                #print( "time match too old", file, file_stat[stat.ST_MTIME]/1000000,"M sec.")
                return False, True

        @staticmethod
        def _filenames_check_(exclude, file_stat, file):
                """checks if the file matches an exclusion file listing"""
                # Try to match file name directly
                if file in exclude['filenames']:
                        return True, False
                # See if file matches via regular expression matching
                else:
                        file_match = False
                        for file_entry in exclude['filenames']:
                                if exclude['filenames'][file_entry].match(file):
                                        file_match = True
                                        break
                if file_match:
                        #print( "filename match ", file)
                        return True, False
                return False, True

        @staticmethod
        def _dotfile_check_(file_stat, file):
                """check if file is a hidden dot file."""
                is_dot_file = False
                head, tail = os.path.split(file)
                if tail:
                        is_dot_file = tail.startswith('.')
                return is_dot_file, not is_dot_file

####################### end _check_limits code block

        @staticmethod
        def _remove_protected(
                        pkgs,
                        clean_me
                        ):
                """Remove files owned by some protected packages.

                @return dict of packages to clean
                @rtype: dictionary
                """
                for cpv in pkgs:
                        uris = pkgs[cpv].split()
                        uris.reverse()
                        while uris:
                                uri = uris.pop()
                                # handle SRC_URI renames of the form "uri -> filename"
                                if uris and uris[-1] == "->":
                                        operator = uris.pop()
                                        file = uris.pop()
                                else:
                                        file = os.path.basename(uri)
                                if file in clean_me:
                                        del clean_me[file]
                        # no need to waste IO time if there is nothing left to clean
                        if not len(clean_me):
                                return clean_me
                return clean_me

        def _non_destructive(self,
                        destructive,
                        fetch_restricted,
                        pkgs_=None,
                        hosts_cpvs=None
                        ):
                """performs the non-destructive checks

                @param destructive: boolean
                @param fetch_restricted: boolean, also look up fetch-restricted ebuilds
                @param pkgs_: starting dictionary to add to
                                defaults to {}.
                @param hosts_cpvs: optional set of cpv's installed on other hosts

                @return (pkgs, deprecated); packages and their SRC_URI's: {cpv: src_uri,}
                @rtype: tuple of dictionaries
                """
                if pkgs_ is None:
                        pkgs = {}
                else:
                        pkgs = pkgs_.copy()
                deprecated = {}
                # the following code block was split to optimize for speed
                # list all CPV from portree (yeah, that takes time...)
                self.output("   - getting complete ebuild list")
                cpvs = set(self.portdb.cpv_all())
                installed_cpvs = set(self.vardb.cpv_all())
                # now add any installed cpv's that are not in the tree or overlays
                cpvs.update(installed_cpvs)
                # Add any installed cpvs from hosts on the network, if any
                if hosts_cpvs:
                        cpvs.update(hosts_cpvs)
                        installed_cpvs.update(hosts_cpvs)
                if fetch_restricted and destructive:
                        self.output("   - getting source file names " +
                                "for %d installed ebuilds" % len(installed_cpvs))
                        pkgs, _deprecated = self._unrestricted(pkgs, installed_cpvs)
                        deprecated.update(_deprecated)
                        # remove the installed cpvs then check the remaining for fetch restriction
                        cpvs.difference_update(installed_cpvs)
                        self.output("   - getting fetch-restricted source file names " +
                                "for %d remaining ebuilds" % len(cpvs))
                        pkgs, _deprecated = self._fetch_restricted(pkgs, cpvs)
                        deprecated.update(_deprecated)
                        # save the installed cpv list to re-use in _destructive()
                        self.installed_cpvs = installed_cpvs.copy()
                else:
                        self.output("   - getting source file names " +
                                "for %d ebuilds" % len(cpvs))
                        pkgs, _deprecated = self._unrestricted(pkgs, cpvs)
                        deprecated.update(_deprecated)
                return pkgs, deprecated

        def _fetch_restricted(self, pkgs_, cpvs):
                """perform fetch restricted non-destructive source
                filename lookups

                @param pkgs_: starting dictionary to add to
                @param cpvs: set of (cat/pkg-ver, ...) identifiers

                @return (pkgs, deprecated) pair of dictionaries
                @rtype: tuple
                """
                if pkgs_ is None:
                        pkgs = {}
                else:
                        pkgs = pkgs_.copy()
                deprecated = {}
                for cpv in cpvs:
                        # get SRC_URI and RESTRICT from aux_get
                        try: # main portdb
                                (src_uri, restrict) = \
                                        self.portdb.aux_get(cpv, ["SRC_URI", "RESTRICT"])
                                # keep fetch-restricted check
                                # inside try so it is bypassed on KeyError
                                if 'fetch' in restrict:
                                        pkgs[cpv] = src_uri
                        except KeyError:
                                try: # installed vardb
                                        (src_uri, restrict) = \
                                                self.vardb.aux_get(cpv, ["SRC_URI", "RESTRICT"])
                                        deprecated[cpv] = src_uri
                                        self.output(DEPRECATED % cpv)
                                        # keep fetch-restricted check
                                        # inside try so it is bypassed on KeyError
                                        if 'fetch' in restrict:
                                                pkgs[cpv] = src_uri
                                except KeyError:
                                        self.output("   - Key Error looking up: " + cpv)
                return pkgs, deprecated

        def _unrestricted(self, pkgs_, cpvs):
                """Perform unrestricted source filenames lookups

                @param pkgs_: starting packages dictionary
                @param cpvs: set of (cat/pkg-ver, ...) identifiers

                @return (pkgs, deprecated) pair of dictionaries
                @rtype: tuple
                """
                if pkgs_ is None:
                        pkgs = {}
                else:
                        pkgs = pkgs_.copy()
                deprecated = {}
                for cpv in cpvs:
                        # get SRC_URI from aux_get
                        try:
                                pkgs[cpv] = self.portdb.aux_get(cpv, ["SRC_URI"])[0]
                        except KeyError:
                                try: # installed vardb
                                        pkgs[cpv] = self.vardb.aux_get(cpv, ["SRC_URI"])[0]
                                        deprecated[cpv] = pkgs[cpv]
                                        self.output(DEPRECATED % cpv)
                                except KeyError:
                                        self.output("   - Key Error looking up: " + cpv)
                return pkgs, deprecated

        def _destructive(self,
                        package_names,
                        exclude,
                        pkgs_=None,
                        installed_included=False
                        ):
                """Builds on pkgs according to input options

                @param package_names: boolean
                @param exclude: an exclusion dict as defined in
                                exclude.parseExcludeFile class.
                @param pkgs_: starting dictionary to add to
                                defaults to {}.
                @param installed_included: bool. pkgs already
                                has the installed cpv's added.

                @return (pkgs, deprecated); pkgs is {cpv: src_uri,}
                """
                if pkgs_ is None:
                        pkgs = {}
                else:
                        pkgs = pkgs_.copy()
                deprecated = {}
                pkgset = set()
                if not installed_included:
                        if not package_names:
                                # list all installed CPV's from vartree
                                #print( "_destructive: getting vardb.cpv_all")
                                if not self.installed_cpvs:
                                        pkgset.update(self.vardb.cpv_all())
                                else:
                                        pkgset.update(self.installed_cpvs)
                                self.output("   - processing %s installed ebuilds" % len(pkgset))
                        elif package_names:
                                # list all CPV's from portree for CP's in vartree
                                #print( "_destructive: getting vardb.cp_all")
                                cps = self.vardb.cp_all()
                                self.output("   - processing %s installed packages" % len(cps))
                                for package in cps:
                                        pkgset.update(self.portdb.cp_list(package))
                self.output("   - processing excluded")
                excludes = self._get_excludes(exclude)
                excludes_length = len(excludes)
                dprint("excludes", "EXCLUDES LENGTH = %d" % excludes_length)
                pkgset.update(excludes)
                pkgs_done = set(pkgs)
                pkgset.difference_update(pkgs_done)
                self.output(
                        "   - (%d of %d total) additional excluded packages to get source filenames for"
                        % (len(pkgset), excludes_length))
                #self.output("   - processing %d ebuilds for filenames" %len(pkgset))
                pkgs, _deprecated = self._unrestricted(pkgs, pkgset)
                deprecated.update(_deprecated)
                #self.output("   - done...")
                return pkgs, deprecated

        def _get_excludes(self, exclude):
                """Expands the exclude dictionary into a set of
                CPV's

                @param exclude: dictionary of exclusion categories,
                        packages to exclude from the cleaning

                @rtype: set
                @return set of package cpv's
                """
                pkgset = set()
                for cp in exclDictExpand(exclude):
                        # add packages from the exclude file
                        dprint("excludes", "_GET_EXCLUDES, cp=" + \
                                cp + ", " + str(self.portdb.cp_list(cp)))
                        pkgset.update(self.portdb.cp_list(cp))
                return pkgset

        def _check_excludes(self, exclude, clean_me):
                """Performs a last minute check on remaining filenames
                to see if they should be protected, since a deprecated
                pkg-version would not have been matched to a source
                filename and removed.

                @param exclude: an exclusion dictionary
                @param clean_me: the dict of filenames for cleaning

                @return (clean_me, saved) pair of dicts
                """
                saved = {}
                pn_excludes = exclDictExpandPkgname(exclude)
                dprint("excludes", "_check_excludes: made it here ;)")
                if not pn_excludes:
                        return clean_me, saved
                dprint("excludes", pn_excludes)
                for key in list(clean_me):
                        if exclMatchFilename(pn_excludes, key):
                                saved[key] = clean_me[key]
                                del clean_me[key]
                                self.output("   ...Saved excluded package filename: " + key)
                return clean_me, saved


def findPackages(
                options,
                exclude=None,
                destructive=False,
                time_limit=0,
                package_names=False,
                pkgdir=None,
                port_dbapi=portage.db[portage.root]["porttree"].dbapi,
                var_dbapi=portage.db[portage.root]["vartree"].dbapi
        ):
        """Find all obsolete binary packages.

        XXX: packages are found only by symlinks.
        Maybe I should also return .tbz2 files from All/ that have
        no corresponding symlinks.

        @param options: dict of options determined at runtime
        @param exclude: an exclusion dict as defined in
                        exclude.parseExcludeFile class.
        @param destructive: boolean, defaults to False
        @param time_limit: integer time value as returned by parseTime()
        @param package_names: boolean, defaults to False.
                        used only if destructive=True
        @param pkgdir: path to the binary package dir being checked
        @param port_dbapi: defaults to portage.db[portage.root]["porttree"].dbapi
                                        can be overridden for tests.
        @param var_dbapi: defaults to portage.db[portage.root]["vartree"].dbapi
                                        can be overridden for tests.

        @rtype: dict
        @return clean_me i.e. {'cat/pkg-ver.tbz2': [filepath],}
        """
        if exclude is None:
                exclude = {}
        clean_me = {}
        # create a full package dictionary

        # now do an access test, os.walk does not error for "no read permission"
        try:
                test = os.listdir(pkgdir)
                del test
        except EnvironmentError as er:
                print(pp.error("Error accessing PKGDIR."), file=sys.stderr)
                print(pp.error("(Check your /etc/make.conf and environment)."), file=sys.stderr)
                print(pp.error("Error: %s" % str(er)), file=sys.stderr)
                sys.exit(1)
        for root, dirs, files in os.walk(pkgdir):
                if root[-3:] == 'All':
                        continue
                for file in files:
                        if not file[-5:] == ".tbz2":
                                # ignore non-tbz2 files
                                continue
                        path = os.path.join(root, file)
                        category = os.path.split(root)[-1]
                        cpv = category + "/" + file[:-5]
                        st = os.lstat(path)
                        if time_limit and (st[stat.ST_MTIME] >= time_limit):
                                # time-limit exclusion
                                continue
                        # dict is cpv->[files] (2 files in general, because of symlink)
                        clean_me[cpv] = [path]
                        #if os.path.islink(path):
                        if stat.S_ISLNK(st[stat.ST_MODE]):
                                clean_me[cpv].append(os.path.realpath(path))
        # keep only obsolete ones
        if destructive:
                dbapi = var_dbapi
                if package_names:
                        cp_all = dict.fromkeys(dbapi.cp_all())
                else:
                        cp_all = {}
        else:
                dbapi = port_dbapi
                cp_all = {}
        for cpv in list(clean_me):
                if exclDictMatchCP(exclude, portage.cpv_getkey(cpv)):
                        # exclusion because of the exclude file
                        del clean_me[cpv]
                        continue
                if dbapi.cpv_exists(cpv):
                        # exclusion because pkg still exists (in porttree or vartree)
                        del clean_me[cpv]
                        continue
                if portage.cpv_getkey(cpv) in cp_all:
                        # exclusion because of --package-names
                        del clean_me[cpv]

        return clean_me