-- JustPaste.it

-- scrapy.lua

http=require'socket.http'

--iterator1 = io.lines("uniq.txt")

-- Inclusive range of topic ids the crawl loop walks through.
start_topic, end_topic = 0, 20000

-- Pieces of a print-view page address. A full URL is assembled as
--   urlbit1 .. <topic id> .. urlbit2 .. <start offset> .. urlbit3
urlbit1 = "http://www.volvo300mania.com/fr/forum/viewtopic.php?t="
urlbit2, urlbit3 = "&start=", "&view=print"
-- Crawl every topic id from start_topic to end_topic (inclusive).
--
-- For each topic the print view is requested page by page. Every page is
-- rendered to "<topic>_<offset>.pdf" with wkhtmltopdf, and the pages of a
-- topic are finally merged into "<topic>.pdf" with pdfunite (or simply
-- renamed when there is only one page).
--
-- Reads the globals start_topic, end_topic, urlbit1, urlbit2, urlbit3 and
-- http (socket.http), all set earlier in the file.

local PAGE_STEP = 15      -- offset increment between two consecutive pages
local MAX_OFFSET = 15000  -- hard upper bound on the offset tried per topic

-- Run a shell command and report whether it exited successfully.
-- os.execute does not raise when the command fails, so its return value has
-- to be inspected: Lua 5.1 returns the numeric status (0 on success), while
-- Lua 5.2+ returns true on success and nil otherwise.
local function run_command(command)
  local result = os.execute(command)
  return result == true or result == 0
end

-- Return true when `path` can be opened for reading.
local function file_exists(path)
  local handle = io.open(path, "rb")
  if handle then
    handle:close()
    return true
  end
  return false
end

for topic = start_topic, end_topic do
--for topic in iterator1 do
  local previous_length = 0
  local saved_pages = {}

  for start_page = 0, MAX_OFFSET, PAGE_STEP do
    local url = urlbit1 .. topic .. urlbit2 .. start_page .. urlbit3
    local body, status_code = http.request(url)

    if body == nil then
      -- Transport failure: the second return value is an error message, not
      -- an HTTP status, so do not report it as a missing topic.
      print("Error fetching topic: " .. topic .. " (" .. tostring(status_code) .. ")")
      break
    end
    if status_code ~= 200 then
      print("Topic " .. topic .. " does not exist.")
      break
    end

    local body_length = #body
    if start_page == 0 then
      print("New topic: " .. topic)
    elseif body_length == previous_length then
      -- Heuristic end-of-topic test: a page with exactly the same size as the
      -- previous one is taken to be a repeat (presumably the forum serves the
      -- last page again once the offset runs past the end -- TODO confirm).
      print("Reached end of this topic.")
      break
    end
    previous_length = body_length

    local page_file = topic .. "_" .. start_page .. ".pdf"
    -- The URL contains '?' and '&', so it must be quoted for the shell.
    local command = "wkhtmltopdf " .. "\"" .. url .. "\" " .. page_file

    -- Drop any stale output from an earlier run so that the existence check
    -- below really reflects this conversion.
    os.remove(page_file)
    local exited_cleanly = run_command(command)

    -- Success is judged by the output file rather than by the exit status
    -- alone, because a non-zero exit does not necessarily mean that no PDF
    -- was written; the status is still surfaced as a warning.
    if file_exists(page_file) then
      if not exited_cleanly then
        print("Warning: wkhtmltopdf reported a problem for: " .. page_file)
      end
      print("Saved: " .. page_file)
      saved_pages[#saved_pages + 1] = page_file
    else
      print("Error converting topic: " .. topic .. " to pdf.")
    end
  end

  local combined_file = topic .. ".pdf"
  local page_count = #saved_pages
  -- Space-separated list with a trailing space, ready to be followed by the
  -- output file name on a command line.
  local saved_list = table.concat(saved_pages, " ") .. " "

  if page_count > 0 then
    print("Saved pages: " .. saved_list)
  end

  if page_count > 1 then
    if run_command("pdfunite " .. saved_list .. combined_file) then
      print("Combined as: " .. combined_file)
      -- Only delete the per-page files once the merge is known to have
      -- worked; otherwise they are the only copy of the data.
      for _, page_file in ipairs(saved_pages) do
        local removed, remove_error = os.remove(page_file)
        if not removed then
          print("Error removing: " .. page_file .. " (" .. tostring(remove_error) .. ")")
        end
      end
      print("Removed originals: " .. saved_list)
    else
      print("Error combining topic: " .. topic .. "; originals kept: " .. saved_list)
    end
  elseif page_count == 1 then
    local renamed, rename_error = os.rename(saved_pages[1], combined_file)
    if renamed then
      print("Renamed original: " .. saved_list .. " to: " .. combined_file)
    else
      print("Error renaming: " .. saved_pages[1] .. " (" .. tostring(rename_error) .. ")")
    end
  end
end