Improve UTF-8 handling, multiple media links, url expanders

author Matthew Vernon <matthewv@chiark.greenend.org.uk>

Tue, 14 Jun 2016 18:43:52 +0000 (19:43 +0100)

committer Matthew Vernon <matthewv@chiark.greenend.org.uk>

Tue, 14 Jun 2016 18:43:52 +0000 (19:43 +0100)
author Matthew Vernon <matthewv@chiark.greenend.org.uk>
Tue, 14 Jun 2016 18:43:52 +0000 (19:43 +0100)
committer Matthew Vernon <matthewv@chiark.greenend.org.uk>
Tue, 14 Jun 2016 18:43:52 +0000 (19:43 +0100)
diff --git a/commands.py b/commands.py

index ce6b199589c628117aa137811de023606036d929..8e3bfe6f336272374a82b0f67df0b81924dfebf2 100755 (executable)
--- a/commands.py
+++ b/commands.py
@@ -1,5 +1,6 @@
  # Part of Acrobat.
  import string, cPickle, random, urllib, sys, time, re, os, twitter, subprocess, datetime, urlparse
+from collections import defaultdict
  from irclib import irc_lower, nm_to_n
  
  # query karma
@@ -501,6 +502,7 @@ def twitterq(bot,cmd,nick,conn,public,twitapi):
      bot.automsg(public, nick, stringout)
    
  def getTweet(urlstring,twitapi):
+  unobfuscate_urls=True
    parts = string.split(urlstring,'/')
    tweetID = parts[-1]
    try:
@@ -509,38 +511,77 @@ def getTweet(urlstring,twitapi):
          return "twitapi.GetStatus returned nothing :-("
      if status.user == None and status.text == None:
          return "Empty status object returned :("
+    if status.retweeted_status and status.retweeted_status.text:
+        status = status.retweeted_status
      if status.user is not None:
-        tweeter_screen = status.user.screen_name.encode('UTF-8', 'replace')
-        tweeter_name = status.user.name.encode('UTF-8', 'replace')
+        tweeter_screen = status.user.screen_name #.encode('UTF-8', 'replace')
+        tweeter_name = status.user.name #.encode('UTF-8', 'replace')
      else:
          tweeter_screen = "[not returned]" ; tweeter_name = "[not returned]"
-    tweetText = status.text.encode('UTF-8', 'replace')
-    tweetText = tweetText.replace('\n',' ')
-
-    for medium in status.media:
-        if "media_url_https" in medium:
-            link = medium["media_url_https"]
-            link = re.sub(r"/tweet_video_thumb/(\w+).jpg", r"/tweet_video/\1.mp4", link)
-            tweetText = tweetText.replace(medium["url"], link)
+        tweeter_name = tweeter_name + " RTing " + status.user.name #.encode('UTF-8', 'replace')
+    tweetText = status.text
+    if status.media:
+        replacements = defaultdict( list )
+        for medium in status.media:
+            replacements[medium.url].append(medium.media_url_https)
+
+        for k,v in replacements.items():
+
+            v = [re.sub(r"/tweet_video_thumb/(\w+).jpg", r"/tweet_video/\1.mp4", link) for link in v]
+            if len(v) > 1:
+                replacementstring = "[" +  " ; ".join(v) +"]"
+            else:
+                replacementstring = v[0]
+            tweetText = tweetText.replace(k, replacementstring)
+
      for url in status.urls:
          toReplace = url.expanded_url
  
+        if unobfuscate_urls:
+            import urllib
+            rv = urlparse.urlparse(toReplace)
+            if rv.hostname in {
+                # sourced from http://bit.do/list-of-url-shorteners.php
+                "bit.do", "t.co", "lnkd.in", "db.tt", "qr.ae", "adf.ly",
+                "goo.gl", "bitly.com", "cur.lv", "tinyurl.com", "ow.ly",
+                "bit.ly", "adcrun.ch", "ity.im", "q.gs", "viralurl.com",
+                "is.gd", "po.st", "vur.me", "bc.vc", "twitthis.com", "u.to",
+                "j.mp", "buzurl.com", "cutt.us", "u.bb", "yourls.org",
+                "crisco.com", "x.co", "prettylinkpro.com", "viralurl.biz",
+                "adcraft.co", "virl.ws", "scrnch.me", "filoops.info", "vurl.bz",
+                "vzturl.com", "lemde.fr", "qr.net", "1url.com", "tweez.me",
+                "7vd.cn", "v.gd", "dft.ba", "aka.gr", "tr.im",
+                 # added by ASB:
+                 "trib.al", "dlvr.it"
+                               }:
+                #expand list as needed.
+                response = urllib.urlopen('http://urlex.org/txt/' + toReplace)
+                resptext = response.read()
+                if resptext.startswith('http'): # ie it looks urlish (http or https)
+                    if resptext != toReplace:
+                        toReplace = resptext
+                    # maybe make a note of the domain of the original URL to compile list of shortenable domains?
+
          # remove tracking utm_ query parameters, for privacy and brevity
          # code snippet from https://gist.github.com/lepture/5997883
          rv = urlparse.urlparse(toReplace)
          if rv.query:
              query = re.sub(r'utm_\w+=[^&]+&?', '', rv.query)
-            toReplace = '%s://%s%s?%s' % (rv.scheme, rv.hostname, rv.path, query)
+            if query:
+                toReplace = '%s://%s%s?%s' % (rv.scheme, rv.hostname, rv.path, query)
+            else:
+                toReplace = '%s://%s%s' % (rv.scheme, rv.hostname, rv.path) # leave off the final '?'
  
          tweetText = tweetText.replace(url.url, toReplace)
  
      tweetText = tweetText.replace("&gt;",">")
      tweetText = tweetText.replace("&lt;","<")
      tweetText = tweetText.replace("&amp;","&")
-    tweetText = tweetText.encode('UTF-8', 'replace')
-
      stringout = "tweet by %s (%s): %s" %(tweeter_screen,tweeter_name,tweetText)
    except twitter.TwitterError:
      terror = sys.exc_info()
      stringout = "Twitter error: %s" % terror[1].__str__()
-  return stringout
+  except Exception:
+    terror = sys.exc_info()
+    stringout = "Error: %s" % terror[1].__str__()
+  return stringout.encode('UTF-8', 'replace')
author	Matthew Vernon <matthewv@chiark.greenend.org.uk>
	Tue, 14 Jun 2016 18:43:52 +0000 (19:43 +0100)
committer	Matthew Vernon <matthewv@chiark.greenend.org.uk>
	Tue, 14 Jun 2016 18:43:52 +0000 (19:43 +0100)