68 lines
2.0 KiB
Python
68 lines
2.0 KiB
Python
import requests
|
|
from bs4 import BeautifulSoup
|
|
import os
|
|
|
|
# mainURL = input('Please enter the URL of eGyanKosh page\n: ')
|
|
# Root of the eGyanKosh site. The link-building helpers in this file use it
# as the default prefix that is prepended to the hrefs they extract.
baseURL = 'https://egyankosh.ac.in'
|
|
# Hard-coded page URL for this run (the interactive input() prompt is
# commented out in this file).
mainURL = 'https://egyankosh.ac.in/handle/123456789/404'
|
|
# Echo the selected URL; this runs at module level, i.e. whenever the file
# is executed or imported.
print(mainURL)
|
|
|
|
def getDataFromURL(url, timeout=30):
    """Fetch *url* with an HTTP GET and return the response body as text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait on the server before giving up; passed
            straight through to ``requests.get``. Without a timeout a
            stalled connection would block the script indefinitely.

    Returns:
        The response body as decoded by requests (``Response.text``).

    Raises:
        AssertionError: If the response status code is anything other
            than 200.

    Any exception raised by ``requests.get`` itself is not caught here and
    propagates to the caller.
    """
    # NOTE(review): verify=False disables TLS certificate verification.
    # It is kept as the original author wrote it (presumably to work around
    # a certificate problem on the target site -- confirm), but it means the
    # server's identity is not checked.
    r = requests.get(url, verify=False, timeout=timeout)
    if r.status_code != 200:
        # Raised explicitly instead of via `assert False`: assert statements
        # are removed when Python runs with -O, which would let error pages
        # be returned as if they were valid content. The exception type is
        # unchanged so existing callers behave the same.
        raise AssertionError(
            'GET %s returned HTTP status %s' % (url, r.status_code)
        )
    return r.text
|
|
|
|
def stripEmptySpace(html):
    """Re-parse *html* after removing whitespace around each of its lines.

    The argument is converted with ``str()``, split on line-feed
    characters, each piece is stripped of leading and trailing whitespace,
    and the pieces are concatenated with no separator. The compacted markup
    is then parsed with BeautifulSoup's ``html.parser``.

    Returns:
        A new ``BeautifulSoup`` object built from the compacted markup.
    """
    stripped_pieces = [piece.strip() for piece in str(html).split("\n")]
    compact_markup = "".join(stripped_pieces)
    return BeautifulSoup(compact_markup, 'html.parser')
|
|
|
|
def getPageInfo(html):
    """Extract the title and type label from a page's HTML.

    Args:
        html: Markup of the page, parsed here with ``html.parser``.

    Returns:
        A dict with two keys:
        ``'pageTitle'`` -- the text of the first ``<h2>`` element, cut at
        its first newline;
        ``'pageType'`` -- the ``.string`` of the first element matching the
        CSS selector ``.col-md-8 small``.

    No guarding is done: a page without an ``<h2>`` or without a match for
    the selector makes the attribute access / indexing below raise.
    """
    document = BeautifulSoup(html, 'html.parser')
    heading_text = document.h2.text
    type_element = document.select(".col-md-8 small")[0]
    return {
        'pageTitle': heading_text.partition('\n')[0],
        'pageType': type_element.string,
    }
|
|
|
|
def getSubPages(html, baseUrl=baseURL):
    """List the sub-pages linked from a listing page.

    Looks up the first element matching ``.col-md-9 .list-group`` and
    reads every ``.list-group-item.row`` inside it. Each row is re-parsed
    through ``stripEmptySpace`` before its ``.string`` and first ``<a>``
    are read.

    Args:
        html: Markup of the listing page.
        baseUrl: Prefix prepended to each row's ``href``. Defaults to the
            module-level ``baseURL``.

    Returns:
        A list of dicts, one per row, each with ``'name'`` (the row's
        ``.string``) and ``'url'`` (``baseUrl`` + the href of the row's
        first anchor).
    """
    soup = BeautifulSoup(html, 'html.parser')
    table = soup.select('.col-md-9 .list-group')[0]
    subpages = []
    for row in table.select(".list-group-item.row"):
        row = stripEmptySpace(row)
        subpages.append({
            'name': row.string,
            # Use the baseUrl argument. The previous code referenced the
            # global baseURL here, so a caller-supplied prefix was ignored.
            'url': baseUrl + row.a.get('href'),
        })
    return subpages
|
|
|
|
def getDownloadPageLinks(html, baseURL=baseURL):
    """Collect the links found in the first ``<table>`` of a page.

    Args:
        html: Markup of the page.
        baseURL: Prefix prepended to each anchor's ``href``. Defaults to
            the module-level ``baseURL``.

    Returns:
        A list of dicts, one per anchor that carries an ``href``
        attribute, each with ``'name'`` (the anchor's ``.string``) and
        ``'url'`` (``baseURL`` + the anchor's href).
    """
    soup = BeautifulSoup(html, 'html.parser')
    # find_all replaces the deprecated findAll alias. href=True restricts
    # the match to anchors that have an href, so an anchor without one
    # (e.g. a named anchor) no longer makes the concatenation below fail
    # with a TypeError.
    anchors = soup.table.find_all('a', href=True)
    return [
        {
            'name': anchor.string,
            'url': baseURL + anchor['href'],
        }
        for anchor in anchors
    ]
|
|
|
|
def downloadPdf(name, pageHtml, baseURL = baseURL):
    """Work out the PDF URL and file name for a download page and print them.

    NOTE: despite the name, this function does not fetch or write anything;
    it only prints a dict describing what would be downloaded.

    Args:
        name: Base name for the file; ``'.pdf'`` is appended to it.
        pageHtml: Markup of the download page.
        baseURL: Prefix prepended to the href of the first anchor matching
            the CSS selector ``.break-all a``. Defaults to the module-level
            ``baseURL``.

    Returns:
        None.
    """
    document = BeautifulSoup(pageHtml, 'html.parser')
    first_link = document.select(".break-all a")[0]
    summary = {
        'url': baseURL + first_link.get('href'),
        'fileName': name + '.pdf',
    }
    print(summary)
|
|
|
|
|
|
|
|
|
|
# print(getPageInfo(getDataFromURL(mainURL)))
|
|
|
|
# collectionPage = getDataFromURL('https://egyankosh.ac.in/handle/123456789/1576')
|
|
# print(getDownloadPageLinks(collectionPage))
|
|
|
|
# downloadPage = getDataFromURL('https://egyankosh.ac.in/handle/123456789/10976')
|
|
# downloadPdf('Unit-4 Mathematical Induction', downloadPage) |