chiark / gitweb /
Switch all headers to python3
[fdroidserver.git] / fdroidserver / stats.py
index 6aeb61297b760a37ac01f5f1284334aa29ecf0cb..351390fbf77bbc4dca6fc8854d655efd825829f1 100644 (file)
@@ -1,5 +1,4 @@
-#!/usr/bin/env python2
-# -*- coding: utf-8 -*-
+#!/usr/bin/env python3
 #
 # stats.py - part of the FDroid server tools
 # Copyright (C) 2010-13, Ciaran Gultnieks, ciaran@ciarang.com
@@ -23,13 +22,16 @@ import re
 import time
 import traceback
 import glob
-from optparse import OptionParser
+import json
+from argparse import ArgumentParser
 import paramiko
 import socket
 import logging
+import common
+import metadata
+import subprocess
+from collections import Counter
 
-import common, metadata
-from common import FDroidPopen
 
 def carbon_send(key, value):
     s = socket.socket()
@@ -41,28 +43,39 @@ def carbon_send(key, value):
 options = None
 config = None
 
+
+def most_common_stable(counts):
+    pairs = []
+    for s in counts:
+        pairs.append((s, counts[s]))
+    return sorted(pairs, key=lambda t: (-t[1], t[0]))
+
+
 def main():
 
     global options, config
 
     # Parse command line...
-    parser = OptionParser()
-    parser.add_option("-v", "--verbose", action="store_true", default=False,
-                      help="Spew out even more information than normal")
-    parser.add_option("-d", "--download", action="store_true", default=False,
-                      help="Download logs we don't have")
-    parser.add_option("--nologs", action="store_true", default=False,
-                      help="Don't do anything logs-related")
-    (options, args) = parser.parse_args()
+    parser = ArgumentParser()
+    common.setup_global_opts(parser)
+    parser.add_argument("-d", "--download", action="store_true", default=False,
+                        help="Download logs we don't have")
+    parser.add_argument("--recalc", action="store_true", default=False,
+                        help="Recalculate aggregate stats - use when changes "
+                        "have been made that would invalidate old cached data.")
+    parser.add_argument("--nologs", action="store_true", default=False,
+                        help="Don't do anything logs-related")
+    options = parser.parse_args()
 
     config = common.read_config(options)
 
     if not config['update_stats']:
-        logging.info("Stats are disabled - check your configuration")
+        logging.info("Stats are disabled - set \"update_stats = True\" in your config.py")
         sys.exit(1)
 
     # Get all metadata-defined apps...
-    metaapps = metadata.read_metadata(options.verbose)
+    allmetaapps = [app for app in metadata.read_metadata().values()]
+    metaapps = [app for app in allmetaapps if not app.Disabled]
 
     statsdir = 'stats'
     logsdir = os.path.join(statsdir, 'logs')
@@ -82,8 +95,8 @@ def main():
             logging.info('Retrieving logs')
             ssh = paramiko.SSHClient()
             ssh.load_system_host_keys()
-            ssh.connect('f-droid.org', username='fdroid', timeout=10,
-                    key_filename=config['webserver_keyfile'])
+            ssh.connect(config['stats_server'], username=config['stats_user'],
+                        timeout=10, key_filename=config['webserver_keyfile'])
             ftp = ssh.open_sftp()
             ftp.get_channel().settimeout(60)
             logging.info("...connected")
@@ -97,13 +110,13 @@ def main():
                     destsize = ftp.stat(f).st_size
                     if (not os.path.exists(destpath) or
                             os.path.getsize(destpath) != destsize):
-                        logging.info("...retrieving " + f)
+                        logging.debug("...retrieving " + f)
                         ftp.get(f, destpath)
         except Exception:
             traceback.print_exc()
             sys.exit(1)
         finally:
-            #Disconnect
+            # Disconnect
             if ftp is not None:
                 ftp.close()
             if ssh is not None:
@@ -115,16 +128,42 @@ def main():
     if not options.nologs:
         # Process logs
         logging.info('Processing logs...')
-        apps = {}
-        appsVer = {}
-        logexpr = '(?P<ip>[.:0-9a-fA-F]+) - - \[(?P<time>.*?)\] "GET (?P<uri>.*?) HTTP/1.\d" (?P<statuscode>\d+) \d+ "(?P<referral>.*?)" "(?P<useragent>.*?)"'
+        appscount = Counter()
+        appsvercount = Counter()
+        logexpr = r'(?P<ip>[.:0-9a-fA-F]+) - - \[(?P<time>.*?)\] ' + \
+            r'"GET (?P<uri>.*?) HTTP/1.\d" (?P<statuscode>\d+) ' + \
+            r'\d+ "(?P<referral>.*?)" "(?P<useragent>.*?)"'
         logsearch = re.compile(logexpr).search
-        for logfile in glob.glob(os.path.join(logsdir,'access-*.log.gz')):
-            logging.info('...' + logfile)
-            p = FDroidPopen(["zcat", logfile])
-            matches = (logsearch(line) for line in p.stdout)
-            for match in matches:
-                if match and match.group('statuscode') == '200':
+        for logfile in glob.glob(os.path.join(logsdir, 'access-*.log.gz')):
+            logging.debug('...' + logfile)
+
+            # Get the date for this log - e.g. 2012-02-28
+            thisdate = os.path.basename(logfile)[7:-7]
+
+            agg_path = os.path.join(datadir, thisdate + '.json')
+            if not options.recalc and os.path.exists(agg_path):
+                # Use previously calculated aggregate data
+                with open(agg_path, 'r') as f:
+                    today = json.load(f)
+
+            else:
+                # Calculate from logs...
+
+                today = {
+                    'apps': Counter(),
+                    'appsver': Counter(),
+                    'unknown': []
+                }
+
+                p = subprocess.Popen(["zcat", logfile], stdout=subprocess.PIPE, universal_newlines=True)
+                matches = (logsearch(line) for line in p.stdout)
+                for match in matches:
+                    if not match:
+                        continue
+                    if match.group('statuscode') != '200':
+                        continue
+                    if match.group('ip') in config['stats_ignore']:
+                        continue
                     uri = match.group('uri')
                     if not uri.endswith('.apk'):
                         continue
@@ -132,131 +171,123 @@ def main():
                     app = knownapks.getapp(apkname)
                     if app:
                         appid, _ = app
-                        if appid in apps:
-                            apps[appid] += 1
-                        else:
-                            apps[appid] = 1
+                        today['apps'][appid] += 1
                         # Strip the '.apk' from apkname
-                        appVer = apkname[:-4]
-                        if appVer in appsVer:
-                            appsVer[appVer] += 1
-                        else:
-                            appsVer[appVer] = 1
+                        appver = apkname[:-4]
+                        today['appsver'][appver] += 1
                     else:
-                        if not apkname in unknownapks:
-                            unknownapks.append(apkname)
+                        if apkname not in today['unknown']:
+                            today['unknown'].append(apkname)
+
+                # Save calculated aggregate data for today to cache
+                with open(agg_path, 'w') as f:
+                    json.dump(today, f)
+
+            # Add today's stats (whether cached or recalculated) to the total
+            for appid in today['apps']:
+                appscount[appid] += today['apps'][appid]
+            for appid in today['appsver']:
+                appsvercount[appid] += today['appsver'][appid]
+            for uk in today['unknown']:
+                if uk not in unknownapks:
+                    unknownapks.append(uk)
 
         # Calculate and write stats for total downloads...
         lst = []
         alldownloads = 0
-        for app, count in apps.iteritems():
-            lst.append(app + " " + str(count))
+        for appid in appscount:
+            count = appscount[appid]
+            lst.append(appid + " " + str(count))
             if config['stats_to_carbon']:
-                carbon_send('fdroid.download.' + app.replace('.', '_'), count)
+                carbon_send('fdroid.download.' + appid.replace('.', '_'),
+                            count)
             alldownloads += count
         lst.append("ALL " + str(alldownloads))
-        f = open('stats/total_downloads_app.txt', 'w')
-        f.write('# Total downloads by application, since October 2011\n')
-        for line in sorted(lst):
-            f.write(line + '\n')
-        f.close()
-
-        f = open('stats/total_downloads_app_version.txt', 'w')
-        f.write('# Total downloads by application and version, since October 2011\n')
+        with open(os.path.join(statsdir, 'total_downloads_app.txt'), 'w') as f:
+            f.write('# Total downloads by application, since October 2011\n')
+            for line in sorted(lst):
+                f.write(line + '\n')
+
         lst = []
-        for appver, count in appsVer.iteritems():
+        for appver in appsvercount:
+            count = appsvercount[appver]
             lst.append(appver + " " + str(count))
-        for line in sorted(lst):
-            f.write(line + "\n")
-        f.close()
+
+        with open(os.path.join(statsdir, 'total_downloads_app_version.txt'), 'w') as f:
+            f.write('# Total downloads by application and version, '
+                    'since October 2011\n')
+            for line in sorted(lst):
+                f.write(line + "\n")
 
     # Calculate and write stats for repo types...
     logging.info("Processing repo types...")
-    repotypes = {}
+    repotypes = Counter()
     for app in metaapps:
-        if len(app['Repo Type']) == 0:
-            rtype = 'none'
-        else:
-            if app['Repo Type'] == 'srclib':
-                rtype = common.getsrclibvcs(app['Repo'])
-            else:
-                rtype = app['Repo Type']
-        if rtype in repotypes:
-            repotypes[rtype] += 1;
-        else:
-            repotypes[rtype] = 1
-    f = open('stats/repotypes.txt', 'w')
-    for rtype, count in repotypes.iteritems():
-        f.write(rtype + ' ' + str(count) + '\n')
-    f.close()
+        rtype = app.RepoType or 'none'
+        if rtype == 'srclib':
+            rtype = common.getsrclibvcs(app.Repo)
+        repotypes[rtype] += 1
+    with open(os.path.join(statsdir, 'repotypes.txt'), 'w') as f:
+        for rtype, count in most_common_stable(repotypes):
+            f.write(rtype + ' ' + str(count) + '\n')
 
     # Calculate and write stats for update check modes...
     logging.info("Processing update check modes...")
-    ucms = {}
+    ucms = Counter()
     for app in metaapps:
-        checkmode = app['Update Check Mode'].split('/')[0]
-        if checkmode in ucms:
-            ucms[checkmode] += 1;
-        else:
-            ucms[checkmode] = 1
-    f = open('stats/update_check_modes.txt', 'w')
-    for checkmode, count in ucms.iteritems():
-        f.write(checkmode + ' ' + str(count) + '\n')
-    f.close()
+        checkmode = app.UpdateCheckMode
+        if checkmode.startswith('RepoManifest/'):
+            checkmode = checkmode[:12]
+        if checkmode.startswith('Tags '):
+            checkmode = checkmode[:4]
+        ucms[checkmode] += 1
+    with open(os.path.join(statsdir, 'update_check_modes.txt'), 'w') as f:
+        for checkmode, count in most_common_stable(ucms):
+            f.write(checkmode + ' ' + str(count) + '\n')
 
     logging.info("Processing categories...")
-    ctgs = {}
+    ctgs = Counter()
     for app in metaapps:
-        if app['Categories'] is None:
-            continue
-        categories = [c.strip() for c in app['Categories'].split(',')]
-        for category in categories:
-            if category in ctgs:
-                ctgs[category] += 1;
-            else:
-                ctgs[category] = 1
-    f = open('stats/categories.txt', 'w')
-    for category, count in ctgs.iteritems():
-        f.write(category + ' ' + str(count) + '\n')
-    f.close()
+        for category in app.Categories:
+            ctgs[category] += 1
+    with open(os.path.join(statsdir, 'categories.txt'), 'w') as f:
+        for category, count in most_common_stable(ctgs):
+            f.write(category + ' ' + str(count) + '\n')
 
     logging.info("Processing antifeatures...")
-    afs = {}
+    afs = Counter()
     for app in metaapps:
-        if app['AntiFeatures'] is None:
+        if app.AntiFeatures is None:
             continue
-        antifeatures = [a.strip() for a in app['AntiFeatures'].split(',')]
-        for antifeature in antifeatures:
-            if antifeature in afs:
-                afs[antifeature] += 1;
-            else:
-                afs[antifeature] = 1
-    f = open('stats/antifeatures.txt', 'w')
-    for antifeature, count in afs.iteritems():
-        f.write(antifeature + ' ' + str(count) + '\n')
-    f.close()
+        for antifeature in app.AntiFeatures:
+            afs[antifeature] += 1
+    with open(os.path.join(statsdir, 'antifeatures.txt'), 'w') as f:
+        for antifeature, count in most_common_stable(afs):
+            f.write(antifeature + ' ' + str(count) + '\n')
 
     # Calculate and write stats for licenses...
     logging.info("Processing licenses...")
-    licenses = {}
+    licenses = Counter()
     for app in metaapps:
-        license = app['License']
-        if license in licenses:
-            licenses[license] += 1;
-        else:
-            licenses[license] = 1
-    f = open('stats/licenses.txt', 'w')
-    for license, count in licenses.iteritems():
-        f.write(license + ' ' + str(count) + '\n')
-    f.close()
+        license = app.License
+        licenses[license] += 1
+    with open(os.path.join(statsdir, 'licenses.txt'), 'w') as f:
+        for license, count in most_common_stable(licenses):
+            f.write(license + ' ' + str(count) + '\n')
+
+    # Write list of disabled apps...
+    logging.info("Processing disabled apps...")
+    disabled = [app.id for app in allmetaapps if app.Disabled]
+    with open(os.path.join(statsdir, 'disabled_apps.txt'), 'w') as f:
+        for appid in sorted(disabled):
+            f.write(appid + '\n')
 
     # Write list of latest apps added to the repo...
     logging.info("Processing latest apps...")
     latest = knownapks.getlatest(10)
-    f = open('stats/latestapps.txt', 'w')
-    for app in latest:
-        f.write(app + '\n')
-    f.close()
+    with open(os.path.join(statsdir, 'latestapps.txt'), 'w') as f:
+        for appid in latest:
+            f.write(appid + '\n')
 
     if unknownapks:
         logging.info('\nUnknown apks:')
@@ -267,4 +298,3 @@ def main():
 
 if __name__ == "__main__":
     main()
-