"""Scraper for eGyanKosh (https://egyankosh.ac.in) course-material pages.

Fetches a DSpace "handle" page, discovers its sub-pages/collections, and
resolves the direct PDF download link for individual items.
"""
import os

import requests
from bs4 import BeautifulSoup

baseURL = 'https://egyankosh.ac.in'
# mainURL = input('Please enter the URL of eGyanKosh page\n: ')
mainURL = 'https://egyankosh.ac.in/handle/123456789/404'
print(mainURL)


def getDataFromURL(url):
    """Fetch *url* and return the response body as text.

    Raises:
        RuntimeError: if the server responds with anything other than 200.
            (The original used a bare ``assert False``, which is stripped
            under ``python -O`` and carries no diagnostic message.)
    """
    # NOTE(review): verify=False disables TLS certificate verification —
    # presumably kept because of the site's certificate chain, but this is
    # open to man-in-the-middle attacks; confirm whether a CA bundle works.
    r = requests.get(url, verify=False)
    if r.status_code != 200:
        raise RuntimeError(
            'Request to {} failed with status {}'.format(url, r.status_code)
        )
    return r.text


def stripEmptySpace(html):
    """Collapse newlines and per-line indentation, then re-parse as a soup.

    Accepts either a string or a bs4 element (``str()`` is applied first).
    """
    joined = "".join(line.strip() for line in str(html).split("\n"))
    return BeautifulSoup(joined, 'html.parser')


def getPageInfo(html):
    """Return ``{'pageTitle', 'pageType'}`` scraped from a handle page.

    ``pageTitle`` is the first line of the first ``<h2>``; ``pageType`` is
    the first ``<small>`` inside ``.col-md-8``.
    """
    soup = BeautifulSoup(html, 'html.parser')
    pageTitle = soup.h2.text.split('\n')[0]
    pageType = soup.select(".col-md-8 small")[0].string
    return {'pageTitle': pageTitle, 'pageType': pageType}


def getSubPages(html, baseUrl=baseURL):
    """Return a list of ``{'name', 'url'}`` dicts for each listed sub-page.

    Bug fix: the original ignored its ``baseUrl`` parameter and always used
    the module-level ``baseURL``, so a caller-supplied base had no effect.
    """
    soup = BeautifulSoup(html, 'html.parser')
    listing = soup.select('.col-md-9 .list-group')[0]
    subpages = []
    for row in listing.select(".list-group-item.row"):
        # Re-parse with whitespace stripped so .string yields the clean name.
        row = stripEmptySpace(row)
        subpages.append({
            'name': row.string,
            'url': baseUrl + row.a.get('href'),
        })
    return subpages


def getDownloadPageLinks(html, baseURL=baseURL):
    """Return ``{'name', 'url'}`` for every anchor in the page's first table."""
    soup = BeautifulSoup(html, 'html.parser')
    return [
        {'name': anchor.string, 'url': baseURL + anchor.get('href')}
        for anchor in soup.table.findAll('a')
    ]


def downloadPdf(name, pageHtml, baseURL=baseURL):
    """Resolve the PDF link on an item page and report it.

    NOTE(review): despite the name, this only prints the resolved URL and
    target filename — the actual file download was never implemented.
    """
    soup = BeautifulSoup(pageHtml, 'html.parser')
    pdfUrl = baseURL + soup.select(".break-all a")[0].get('href')
    fileName = name + '.pdf'
    print({'url': pdfUrl, 'fileName': fileName})


# print(getPageInfo(getDataFromURL(mainURL)))
# collectionPage = getDataFromURL('https://egyankosh.ac.in/handle/123456789/1576')
# print(getDownloadPageLinks(collectionPage))
# downloadPage = getDataFromURL('https://egyankosh.ac.in/handle/123456789/10976')
# downloadPdf('Unit-4 Mathematical Induction', downloadPage)