#!/usr/bin/python3
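# agaetr feed ingester: reads the feeds listed in agaetr.ini, pulls
# title/link/image/tags from each entry, and appends anything new to a
# flat-file queue (posts.db) for the posting side of agaetr to send out.
# Mirror of https://github.com/uriel1998/agaetr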
import feedparser
import time
import string
from time import strftime, localtime
from subprocess import check_output
import sys
import json
from bs4 import BeautifulSoup
from pprint import pprint
import configparser
import os
from os.path import expanduser
from appdirs import user_data_dir, user_cache_dir, user_config_dir
from pathlib import Path
import shutil
import requests
import urllib.parse

########################################################################
# Defining configuration locations and such
########################################################################

appname = "agaetr"
appauthor = "Steven Saus"

# Where to store data, duh
datadir = user_data_dir(appname)
cachedir = user_cache_dir(appname)
configdir = user_config_dir(appname)

if not os.path.isdir(datadir):
    os.makedirs(datadir)
# local cache
if not os.path.isdir(cachedir):
    os.makedirs(cachedir)
# configuration directory
if not os.path.isdir(configdir):
    os.makedirs(configdir)

ini = os.path.join(configdir, 'agaetr.ini')
db = os.path.join(datadir, 'posts.db')         # posts waiting to be sent
posteddb = os.path.join(datadir, 'posted.db')  # posts already sent
tmp = os.path.join(cachedir, 'posts.db')       # scratch copy for sorting

# make sure the flat-file databases exist
Path(posteddb).touch()
Path(db).touch()
Path(tmp).touch()

########################################################################
# Have we already posted this? (our "db" is a flat file, btw)
# Added in check for the posted db
########################################################################

def post_is_in_db(title):
    # scan both the pending queue and the already-posted list
    with open(db, 'r') as database:
        for line in database:
            if title in line:
                return True
    with open(posteddb, 'r') as database2:
        for line1 in database2:
            if title in line1:
                return True
    return False
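
# Each line of the flat-file db is a single pipe-delimited record, matching
# the f.write() calls in parse_that_feed() below:
#   time|title|link|contentwarnings|imgalt|imgurl|hashtags|description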

########################################################################
# Parsing that feed!
########################################################################

def parse_that_feed(url, sensitive, CW, GCW):

    feed = feedparser.parse(url)

    for post in feed.entries:

        # Scrub newlines, stray <p> tags, and pipes (our field separator)
        # from the title; the cleaned title doubles as the dedupe key, so
        # if the post is already in the database we can skip it below.
        title = post.title.replace('\n', ' ').replace('\r', '').replace('<p>', '').replace('</p>', '').replace('|', ' ')
        post.title = title
        itemurl = post.link

        # cleaning up descriptions and summaries. Boy, are they trash.
        post_description = ""
        if hasattr(post, 'description'):
            if "permalink" not in (str.lower(post.description)):
                post_description = post.description.replace('\n', ' ').replace('\r', '').replace('<p>', '').replace('</p>', '').replace('|', ' ')
                # collapse runs of whitespace, then strip remaining HTML
                post_description = " ".join(post_description.split())
                post_description = BeautifulSoup(post_description, 'html.parser').text
        elif hasattr(post, 'summary'):
            if "permalink" not in (str.lower(post.summary)):
                post_description = post.summary.replace('\n', ' ').replace('\r', '').replace('<p>', '').replace('</p>', '').replace('|', ' ')
                post_description = " ".join(post_description.split())
                post_description = BeautifulSoup(post_description, 'html.parser').text

        # While this avoids errors from the TT-RSS feed, it provides a bad date.
        # And since the python module pulls in the feed directly, hence the need
        # for our preprocessor. (And probably also a quick way to see if it's
        # been updated, too.)
        date_published = localtime()
        thetime = time.strftime("%Y%m%d%H%M%S", localtime())

        if hasattr(post, 'published_parsed') and post.published_parsed:
            # feedparser gives us a time.struct_time we can format directly
            date_parsed = post.published_parsed
            thetime = time.strftime("%Y%m%d%H%M%S", date_parsed)
        elif hasattr(post, 'published'):
            # post.published is a raw string, so leave thetime at the
            # current-time fallback rather than feeding it to strftime()
            date_published = post.published
        elif hasattr(post, 'updated'):
            date_published = post.updated
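
        # thetime is a sortable YYYYMMDDHHMMSS stamp, e.g. "20240131093000"
        # for 2024-01-31 09:30:00; the final sort pass below relies on this.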

        if not post_is_in_db(title):
            f = open(db, 'a')
            tags = []
            hashtags = []

            if hasattr(post, 'tags'):
                # first pass: build #hashtags, skipping housekeeping terms
                i = 0
                while i < len(post.tags):
                    term = str.lower(post.tags[i]['term'])
                    if ("uncategorized" not in term and "onetime" not in term
                            and "overnight" not in term and "post" not in term):
                        post.tags[i]['term'] = post.tags[i]['term'].replace(':', ' ').replace('|', ' ').replace('/', ' ').replace('\\', ' ').replace('  ', ' ').replace(' ', '-')
                        hashtags.append('#%s' % str.lower(post.tags[i]['term']))
                    i += 1

                if GCW:
                    tags.append('%s' % str.lower(GCW))

                # second pass: build the plain tag list used for CW matching
                i = 0
                while i < len(post.tags):
                    term = str.lower(post.tags[i]['term'])
                    if ("uncategorized" not in term and "onetime" not in term
                            and "overnight" not in term and "post" not in term
                            and term not in tags):
                        post.tags[i]['term'] = post.tags[i]['term'].replace('|', ' ').replace('/', ' ').replace('\\', ' ')
                        tags.append('%s' % str.lower(post.tags[i]['term']))
                    i += 1

            # Do we always have CW on this feed?
            if CW == "no":
                cwmarker = 0
                ContentWarningString = ""
            else:
                cwmarker = 1
                ContentWarningString = GCW

            # Each [cw*] ini section supplies a display keyword plus a list
            # of matching terms; check the post's tags and title against it.
            for x in sections:
                if "cw" in (str.lower(x)):
                    ContentWarningList = str.lower(config['DEFAULT']['filters'])
                    keyword = config[x]['keyword']
                    ContentWarningList = ContentWarningList + str.lower(config[x]['matches'])

                    if hasattr(post, 'tags'):
                        for d in tags:
                            if d in ContentWarningList.split():
                                cwmarker += 1
                                ContentWarningString = ContentWarningString + " " + keyword
                    # double checking with title as well
                    bob = str.lower((', '.join(tags)) + ' ' + post.title)
                    for d in ContentWarningList.split():
                        if d in bob.split():
                            cwmarker += 1
                            ContentWarningString = ContentWarningString + " " + keyword
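
            # For reference, a [cw*] section of agaetr.ini would look
            # something like this (names and values illustrative only):
            #   [cw_politics]
            #   keyword = politics
            #   matches = election senate parliament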

            imgalt = None
            imgurl = None
            # Look for image in media content first
            if 'media_content' in post:

                # Trying the sleep function here in case there's flood
                # protection on the server we're checking images from
                time.sleep(2)
                mediaContent = post.media_content
                for item in post.media_content:
                    # making sure it's not flash/video from Youtube/Vimeo
                    if 'type' in item:
                        if "flash" in (item['type']):
                            # it's a video; use its thumbnail image instead
                            if 'media_thumbnail' in post:
                                mediaContent = post.media_thumbnail
                                for thumb in post.media_thumbnail:
                                    amgurl = thumb['url'].split('?')
                                    if amgurl[0].endswith("jpg"):
                                        # check the query-stripped URL still resolves
                                        r = requests.head(amgurl[0], timeout=10)
                                        if (int(r.status_code) == 200):
                                            imgurl = amgurl[0]
                                    else:
                                        imgurl = thumb['url']
                                    imgalt = post.title
                                    break
                            break
                    amgurl = item['url'].split('?')
                    if amgurl[0].endswith("jpg"):
                        # check the query-stripped URL still resolves
                        r = requests.head(amgurl[0], timeout=10)
                        if (int(r.status_code) == 200):
                            imgurl = amgurl[0]
                    else:
                        imgurl = item['url']
                    imgalt = post.title
                    break

            else:
                # Finding image in the html
                if 'content' in post:
                    soup = BeautifulSoup((post.content[0]['value']), 'html.parser')
                    imgtag = soup.find("img")
                else:
                    soup = BeautifulSoup(urllib.parse.unquote(post.description), 'html.parser')
                    imgtag = soup.find("img")

                # checking for tracking images (tiny 1px "web bugs"): only
                # accept the image if it has no width attribute or a width
                # greater than 2px
                if imgtag:
                    if not imgtag.has_attr('width') or int(imgtag['width']) > 2:
                        imgurl = imgtag['src']
                        # seeing if there's an alt title for accessibility
                        if imgtag.has_attr('alt'):
                            imgalt = imgtag['alt']
                        elif imgtag.has_attr('title'):
                            imgalt = imgtag['title']
                        else:
                            imgalt = None
                        # checking for empty strings (and None)
                        if imgalt:
                            imgalt = imgalt.strip()
                        if not imgalt:
                            imgalt = post.title
print("# Adding " + post.title)
|
|
|
|
if cwmarker > 0:
|
|
words = ContentWarningString.split()
|
|
ContentWarningString = (",".join(sorted(set(words), key=words.index)))
|
|
HashtagsString = str.lower(' '.join(hashtags))
|
|
words2 = HashtagsString.split()
|
|
HashtagsString = (" ".join(sorted(set(words2), key=words2.index)))
|
|
f.write(thetime + "|" + post.title + "|" + post.link + "|" + str.lower(ContentWarningString) + "|" + str(imgalt) + "|" + str(imgurl) + "|" + HashtagsString + "|" + str(post_description) + "\n")
|
|
else:
|
|
HashtagsString = str.lower(' '.join(hashtags))
|
|
words2 = HashtagsString.split()
|
|
HashtagsString = (" ".join(sorted(set(words2), key=words2.index)))
|
|
f.write(thetime + "|" + post.title + "|" + post.link + "|" + "|" + str(imgalt) + "|" + str(imgurl) + "|" + HashtagsString + "|" + str(post_description) + "\n")
|
|
|
|
f.close
|
|
else:
|
|
print("## Already have " + post.title)
|
|
return
|
|
|
|
|
|

########################################################################
# Read ini section
########################################################################

config = configparser.ConfigParser()
config.read(ini)
sections = config.sections()

########################################################################
# Begin loop over feedlist
########################################################################

ContentWarningList = config['DEFAULT']['filters']
ContentWarningString = str.lower(config['DEFAULT']['GlobalCW'])

for x in sections:
    if "feed" in (str.lower(x)):
        feed = config[x]['url']
        feed_sensitive = config[x]['sensitive']
        if 'y' in config['DEFAULT']['ContentWarning']:
            feed_CW = config['DEFAULT']['ContentWarning']
            feed_GlobalCW = config[x]['GlobalCW'] + " " + str.lower(config['DEFAULT']['GlobalCW'])
        else:
            feed_CW = config[x]['ContentWarning']
            feed_GlobalCW = config[x]['GlobalCW']

        parse_that_feed(feed, feed_sensitive, feed_CW, feed_GlobalCW)
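
# For reference, a feed section (and the DEFAULT keys read above) would
# look something like this (names and values illustrative only):
#   [DEFAULT]
#   filters =
#   GlobalCW =
#   ContentWarning = no
#
#   [feed1]
#   url = https://example.com/feed.xml
#   sensitive = no
#   ContentWarning = no
#   GlobalCW =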

# Re-sort the pending-posts file by timestamp (the first |-delimited field),
# using a scratch copy in the cache directory.
shutil.copyfile(db, tmp)

infile = open(tmp, 'r')
lines = infile.readlines()
infile.close()

out = open(db, 'w')
for line in sorted(lines, key=lambda line: line.split('|')[0]):
    out.write(line)
out.close()

os.remove(tmp)

exit()