I'm getting started with web scraping and I'm trying to download all the images of an imgur post. There are more than 1500, but I only download the first 15, because the page loads images 15 at a time. How can I make requests.get(url).content give me the fully loaded page? Or can I do it with some other library?
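For what it's worth, I have read that imgur also has a JSON API that can list every image of an album in one request, so maybe I don't need the fully rendered page at all. This is only a rough sketch of that idea, not something I have working (the Client-ID is a placeholder you would get by registering an app on api.imgur.com):

import requests as r

album_id="46UVO"  # the id at the end of http://m.imgur.com/a/46UVO
api_url="https://api.imgur.com/3/album/"+album_id+"/images"
headers={"Authorization":"Client-ID YOUR_CLIENT_ID"}  # placeholder Client-ID
for img in r.get(api_url,headers=headers).json()["data"]:
    print img["link"]  # direct link to each image in the album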
The code of my scraper is as follows:
from bs4 import BeautifulSoup as bs
import requests as r
from os import listdir as ls
from random import randint as rand
#a few helper functions
abc="abcdefghijklmnopqrstuvwxyz"
abc=abc+abc.upper()+"1234567890_"
def get_ext(url):
return url[-4:]
def fix_rel(url,prot="http"):
if url[:len(prot)]==prot:
return url
else:
if url[:2]=="//":
return prot+":"+url
else:
return prot+":/"+url
def random_string(leng=5):
gen=str()
for el in range(0,leng):
gen=gen+abc[rand(0,len(abc)-1)]
return gen
def add(q):
try:
return str(int(q)+1)
except:
return str(q)+random_string(8)
def setfilename(dire,name="1",ext=".txt"):
if name+ext in ls(dire):
return setfilename(dire,add(name),ext)
else:
return name+ext
def down(url,dest,filename="download",ext=".txt"):
    f=open(dest+setfilename(dest,filename,ext),"wb")  # "wb": image data is binary
f.write(r.get(url).content)
f.close()
#the web scraping starts here
url="http://m.imgur.com/a/46UVO"
#url="http://m.imgur.com/gallery/hW9it"
dest="/sdcard/images/mlpedits/"
print "Descargando html..."
sopa=bs(ul.urlopen(url).read(),"html.parser")
i=0
ok=0
print "Descargando imagenes..."
imagenes=sopa.find_all("img")
c=len(imagenes)
for el in imagenes:
i+=1
try:
        down(fix_rel(el["src"]),dest,"a",get_ext(el["src"]))  # fix_rel and get_ext are the local helpers above, not a "basic" module
print str(i)+"/"+str(c)+" -> "+el["src"]
ok+=1
except:
print str(i)+"/"+str(c)+" x "+el["src"]
print "\n\n",str(ok)+"/"+str(i)+" descargados"