My Scrapy spider works for everything else, but when I extract the URL with the XPath `@href` I do not get the complete URL: the final part, from the "?" sign onward (the GET query string of the URL), is always missing. Does anyone know whether Scrapy has a character limit or something similar? This is the first time this has happened to me; the other spiders I run against other pages work perfectly. Thanks for the answers.
I include the complete code of the spider:
import scrapy
class autoscouth(scrapy.Spider):
    """Spider that scrapes the Audi listing page of autoscout24.es.

    For every listing node found on the results page, ``parse`` yields one
    dict with the keys ``url``, ``marca``, ``version``, ``combustible``,
    ``precio``, ``cv``, ``km``, ``anio`` and ``ciudad``.
    """

    name = "autoscouth"
    start_urls = [
        'https://www.autoscout24.es/lst/audi?sort=standard&desc=0&offer=J%2CU%2CO%2CD%2CS&ustate=N%2CU&cy=E&atype=C',
    ]

    # XPath selecting each listing node on the results page.
    _LISTING_XPATH = '//*[@class="cldt-summary-full-item"]'

    # XPath of the detail-page link, relative to one listing node.
    _HREF_XPATH = './/div[1]/div[1]/div[1]/a/@href'

    # (item key, XPath relative to one listing node) for every text field.
    # Kept as a tuple of pairs so the output key order is fixed.
    _FIELD_XPATHS = (
        ('marca', './/div[1]/div[1]/div[1]/a/div[1]/h2[1]/text()'),
        ('version', './/div[1]/div[1]/div[1]/a/div[1]/h2[2]/text()'),
        ('combustible', './/div[1]/div[3]/div[2]/ul/li[7]/text()'),
        ('precio', './/div[1]/div[3]/div[1]/div[1]/span/text()'),
        ('cv', './/div[1]/div[3]/div[2]/ul/li[3]/text()'),
        ('km', './/div[1]/div[3]/div[2]/ul/li[1]/text()'),
        ('anio', './/div[1]/div[3]/div[2]/ul/li[2]/text()'),
        # Must start with ".//": a leading "//" is evaluated against the
        # whole document and would return the same first match for every
        # listing.
        # NOTE(review): confirm this relative path against the page markup.
        ('ciudad', './/div[2]/div/div[2]/div/div[2]/div[2]/span[2]/text()'),
    )

    def parse(self, response):
        """Yield one dict per car listing found in ``response``.

        Each text field is the first match of its XPath inside the listing
        node, or ``None`` when nothing matches. ``url`` is the listing's
        href resolved against the response URL, or ``None`` when the
        listing has no link.
        """
        for autos in response.xpath(self._LISTING_XPATH):
            href = autos.xpath(self._HREF_XPATH).extract_first()
            # Resolve the (possibly relative) href against the page URL
            # instead of gluing it onto a hand-written host prefix; a
            # missing href becomes None rather than raising TypeError.
            item = {'url': response.urljoin(href) if href else None}
            for key, xpath in self._FIELD_XPATHS:
                item[key] = autos.xpath(xpath).extract_first()
            yield item