aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorErik Liodden <eriklio@stud.ntnu.no>2017-09-21 14:57:35 +0200
committerErik Liodden <eriklio@stud.ntnu.no>2017-09-21 14:57:35 +0200
commit735a5f0df0b810b22d38b8c0566eb02f1a23060b (patch)
tree87006d8ae41eaea7cba23968f40ac76a9ab6148e
parent6ada2024dc6f241b3b788266bcd9be6d4970b965 (diff)
downloadboil-me-baby-735a5f0df0b810b22d38b8c0566eb02f1a23060b.tar.gz
add main program
the main program to fetch pdfs. run with: python boil_me_baby.py --help. there are still a few things that can be improved, mainly the regexes collecting pdfs and course urls. however, for an early version it works pretty well.
-rw-r--r--boil_me_baby.py167
1 files changed, 167 insertions, 0 deletions
diff --git a/boil_me_baby.py b/boil_me_baby.py
new file mode 100644
index 0000000..d8a4452
--- /dev/null
+++ b/boil_me_baby.py
@@ -0,0 +1,167 @@
+import argparse
+from sys import argv, exit
+
# Build the CLI parser before the heavier imports below, so --help answers fast.
# (The original created a throwaway ArgumentParser and immediately overwrote it.)
parser = argparse.ArgumentParser(prog='lf-fetch', usage='%(prog)s [options]',
        description='ls-parser searches and fetches .pdfs from math courses.')
parser.add_argument('-s', '--semester', default="all",
        help='which semester. eg \'2017h\' or \'2015v\' or \'all\'')
parser.add_argument('--lf-only', action='store_true', default=False,
        help='only fetch lfs')
parser.add_argument('--list-courses', action='store_true', default=False,
        help='list courses and semesters')
parser.add_argument('--dry-run', action='store_true', default=False,
        help='don\'t download anything')

# Called with no arguments at all: show usage instead of crawling everything.
if len(argv) == 1:
    parser.print_help()
    exit(0)

args = parser.parse_args()
+
+import requests
+import re
+import os
+from subprocess import call
+import pickle
+
def find_pdfs(html):
    '''Collect .pdf links from a page's HTML string.

    Two loose patterns are tried: href attributes pointing under /_media,
    and any title attribute naming a .pdf file.  Site-relative links are
    made absolute against the wiki host.  Can be improved -- the patterns
    are intentionally permissive.
    '''
    matches = re.findall(r'href="\s*(\/\_media.*?\.pdf)\s*"', html)
    matches += re.findall(r'title="\s*(.*?\.pdf)\s*"', html)

    pdfs = []
    for link in matches:
        if link.startswith('/'):
            # site-relative link -> absolute URL on the wiki host
            pdfs.append("https://wiki.math.ntnu.no" + link)
        else:
            pdfs.append(link)

    return pdfs
+
def find_pdfs2(html):
    '''Alternative .pdf scraper: any quoted string ending in .pdf. Not in use.

    The original called re.findall() on a pattern with two capture groups,
    which yields tuples -- link.startswith() then raised AttributeError.
    Use finditer() and take the link text (without its quotes) instead.
    '''
    pdfs = []
    for match in re.finditer(r'(["\'])((?:\\?.)*?\.pdf)\1', html):
        link = match.group(2)
        if link.startswith('/'):
            # site-relative link -> absolute URL on the wiki host
            pdfs.append("https://wiki.math.ntnu.no" + link)
        else:
            pdfs.append(link)

    for pdf in pdfs:
        print(pdf)
    return pdfs
+
def fetch_site(url):
    '''Return the page at *url* as a string, or None on a request failure.

    Callers compare the result against None, but requests.get() raises on
    network errors rather than returning None -- swallow those here so a
    single dead URL does not abort the whole crawl.
    '''
    try:
        r = requests.get(url)
        return r.text
    except requests.exceptions.RequestException:
        return None
+
def get_course_codes(courses):
    '''Populate *courses* with course codes scraped from the wiki index page.

    Codes look like "ma1101" / "tma4100" followed by four digits.  The
    original pattern mixed a character class with a bare alternation, so a
    lone "m" or "a" in a title matched; use real prefix alternations.
    Returns the number of regex matches (duplicates included), matching
    the count that is printed.
    '''
    html = fetch_site("https://wiki.math.ntnu.no/emner")
    m = re.findall(r'title="\s*((?:ma|tma).*?\d{4})\s*"', html)
    # ST courses; prefixes assumed to be st/tst -- TODO confirm against the site
    m += re.findall(r'title="\s*((?:st|tst).*?\d{4})\s*"', html)

    for course in m:
        courses[course] = {}
    print("found {} courses".format(len(m)))

    return len(m)
+
def get_course_years(courses):
    '''Fill in the semesters (e.g. "2017h") listed on each course's pages.

    Both the wiki and the www host are checked.  Only semesters matching
    --semester are kept ("all" keeps every one).  The original pattern put
    the course code inside [...], i.e. a character class of its letters;
    match the literal (escaped) code instead, with a raw f-string so \\s
    is not a broken escape.
    '''
    for code in courses.keys():
        html = fetch_site("https://wiki.math.ntnu.no/" + code)
        html2 = fetch_site("https://www.math.ntnu.no/emner/" + code)
        pattern = rf'title="\s*({re.escape(code)}.*?)\s*"'
        m = re.findall(pattern, html)
        m += re.findall(pattern, html2)

        for site in m:
            try:
                # titles look like "<code>:<semester>"
                year = site.split(':')[1]
            except IndexError:
                # no ":<semester>" part -- not a semester link
                continue
            if year not in courses[code].keys() and (year == args.semester or
                    args.semester == "all"):
                courses[code][year] = []
+
def get_site_with_exercises(courses):
    '''Crawl likely exercise pages per course/semester and wget the pdfs.

    A handful of conventional sub-page names are tried on both the wiki
    and the www host; pdf links found are appended to courses[code][year]
    and downloaded into out/<code>/<year>.  With --lf-only, only pdfs
    whose name hints at a solution set are downloaded.
    '''
    os.makedirs("out", exist_ok=True)  # replaces shelling out to `mkdir -p`
    hosts = ("https://wiki.math.ntnu.no/", "https://www.math.ntnu.no/emner/")
    sites = [
        "/ov",
        "/exer",
        "/exerlist",
        "/oving",
        "/ovinger",
        "/start",
        "/exercises",
        "/fremdriftsplan",
        "/files",
        "/LF"
    ]
    # substrings that mark a pdf as a solution ("lf" = losningsforslag)
    lf_hints = ("forslag", "lf", "sol", "answer", "losn")
    for code in courses.keys():
        for year in courses[code].keys():
            if year != "all" and year != args.semester:
                continue
            for sub in sites:
                for host in hosts:  # wiki first, then www -- same order as before
                    html = fetch_site(host + code + '/' + year + sub)
                    if html is not None:
                        courses[code][year] += find_pdfs(html)

            for pdf in courses[code][year]:
                # with --lf-only, skip pdfs that don't look like solutions
                if args.lf_only and not any(h in pdf for h in lf_hints):
                    continue
                call(["wget", "-nc", "-q", "--show-progress", "-P",
                      "out/" + code + '/' + year, pdf])
+
def list_courses(courses):
    '''Print the known courses, filtered by the --semester flag.

    With --semester all, every course is printed with each semester seen;
    otherwise only the course codes offering that semester are listed,
    under a small header naming the semester.
    '''
    show_all = (args.semester == "all")
    if not show_all:
        print("year: {}\n===========".format(args.semester))
    for code in courses:
        if show_all:
            semesters = ''.join(' ' + y for y in courses[code])
            print(code + ':' + semesters)
        elif args.semester in courses[code]:
            print(code)
+
def main():
    '''Entry point: load or rebuild the course cache, then list and/or fetch.

    The course index is cached in courses.p so repeated runs can skip the
    slow scraping of course codes and semesters.  The original used a bare
    except and open() without closing the handle; narrow the exceptions to
    "cache missing or unreadable" and use context managers.
    '''
    try:
        with open("courses.p", "rb") as cache:
            courses = pickle.load(cache)
    except (OSError, pickle.UnpicklingError, EOFError):
        # no usable cache -- rebuild it from the web
        courses = {}
        get_course_codes(courses)
        get_course_years(courses)

    if args.list_courses:
        list_courses(courses)

    if not args.dry_run:
        get_site_with_exercises(courses)

    # always refresh the cache, even on a dry run (same as before)
    with open("courses.p", "wb") as cache:
        pickle.dump(courses, cache)

    return 0
+
# Run the crawler only when executed as a script, not when imported.
if __name__ == '__main__':
    main()