Scrapy upload file

前端 未结 2 572
说谎
说谎 2020-12-11 23:34

I am making a form request to a website using Scrapy. The form requires uploading a PDF file. How can this be done in Scrapy? I am trying it like this:

FormRequest         


        
相关标签:
2条回答
  • 2020-12-12 00:19

    I just spent an entire day trying to figure out how to implement this. Finally, I came upon a Scrapy pull request from 2016 that was never merged, with an implementation of a multipart form request:

    from scrapy import FormRequest
    from six.moves.urllib.parse import urljoin, urlencode
    import lxml.html
    from parsel.selector import create_root_node
    import six
    import string
    import random
    from scrapy.http.request import Request
    from scrapy.utils.python import to_bytes, is_listlike
    from scrapy.utils.response import get_base_url
    
    
    class MultipartFormRequest(FormRequest):
        """``FormRequest`` variant that sends ``formdata`` as ``multipart/form-data``.

        ``formdata`` may be a dict or an iterable of ``(name, value)`` pairs; a
        value that is a ``MultipartFile`` is encoded as a file part. The method
        defaults to ``POST``. The multipart body is only built when ``formdata``
        is non-empty, the method is ``POST`` and the ``Content-Type`` header is
        exactly ``multipart/form-data``; otherwise the request is left as the
        parent constructor built it.
        """

        def __init__(self, *args, **kwargs):
            # Removed from kwargs here, so it is not forwarded to the parent
            # constructor.
            formdata = kwargs.pop('formdata', None)

            kwargs.setdefault('method', 'POST')

            super(MultipartFormRequest, self).__init__(*args, **kwargs)

            # Keeps a caller-supplied Content-Type; falls back to multipart.
            content_type = self.headers.setdefault(b'Content-Type', [b'multipart/form-data'])[0]
            method = kwargs.get('method').upper()
            if formdata and method == 'POST' and content_type == b'multipart/form-data':
                items = formdata.items() if isinstance(formdata, dict) else formdata

                # The boundary is 20 random ASCII letters/digits; it is
                # advertised in the header and used to delimit every part.
                alphabet = string.digits + string.ascii_letters
                self._boundary = to_bytes(
                    ''.join(random.choice(alphabet) for _ in range(20)),
                    self.encoding)
                self.headers[b'Content-Type'] = b'multipart/form-data; boundary=' + self._boundary
                request_data = _multpart_encode(items, self._boundary, self.encoding)
                self._set_body(request_data)
    
    
    class MultipartFile(object):
        """Plain holder describing one file to send in a multipart form.

        Attributes:
            name: file name reported for the uploaded file.
            content: the file payload.
            mimetype: content type of the payload; defaults to
                ``application/octet-stream``.
        """

        def __init__(self, name, content, mimetype='application/octet-stream'):
            self.name, self.content, self.mimetype = name, content, mimetype
    
    
    def _get_form_url(form, url):
        if url is None:
            return urljoin(form.base_url, form.action)
        return urljoin(form.base_url, url)
    
    
    def _urlencode(seq, enc):
        """URL-encode ``(key, value)`` pairs after converting them to bytes.

        *seq* is an iterable of pairs; a list-like value contributes one
        ``key=value`` entry per element. Keys and values are converted with
        ``to_bytes`` using encoding *enc*.
        """
        pairs = []
        for key, raw in seq:
            candidates = raw if is_listlike(raw) else [raw]
            for item in candidates:
                pairs.append((to_bytes(key, enc), to_bytes(item, enc)))
        return urlencode(pairs, doseq=1)
    
    
    def _multpart_encode(items, boundary, enc):
        """Serialize form fields into a ``multipart/form-data`` body.

        :param items: iterable of ``(name, value)`` pairs. A ``MultipartFile``
            value is written as a file part (with ``filename`` and its own
            ``Content-Type``); any other value is written as a plain field.
        :param boundary: boundary token as bytes, without the leading ``--``.
        :param enc: encoding handed to ``to_bytes`` for names, file names,
            MIME types and values.
        :return: the body as bytes; lines are joined with CRLF and the body
            ends with the closing ``--boundary--`` delimiter.
        """

        def _quoted(text):
            # name="..." and filename="..." are quoted header parameters: a raw
            # double quote would end the parameter early and a raw CR/LF would
            # start a new header line. Percent-escape them as browsers do.
            return (to_bytes(text, enc)
                    .replace(b'"', b'%22')
                    .replace(b'\r', b'%0D')
                    .replace(b'\n', b'%0A'))

        body = []

        for name, value in items:
            body.append(b'--' + boundary)
            if isinstance(value, MultipartFile):
                body.append(
                    b'Content-Disposition: form-data; name="' + _quoted(name)
                    + b'"; filename="' + _quoted(value.name) + b'"')
                body.append(b'Content-Type: ' + to_bytes(value.mimetype, enc))
                # Empty line separates the part headers from the payload.
                body.append(b'')
                body.append(to_bytes(value.content, enc))
            else:
                body.append(b'Content-Disposition: form-data; name="' + _quoted(name) + b'"')
                body.append(b'')
                body.append(to_bytes(value, enc))

        body.append(b'--' + boundary + b'--')
        return b'\r\n'.join(body)
    
    
    def _get_form(response, formname, formid, formnumber, formxpath):
        """Locate the ``<form>`` element to work with in *response*.

        Criteria are tried in this order: ``name`` attribute (*formname*),
        ``id`` attribute (*formid*), *formxpath* (the first matched node or its
        nearest enclosing form), and finally the position *formnumber* among
        all forms. Returns ``None`` when none of the criteria selects a form
        and *formnumber* is ``None``.

        Raises ``ValueError`` if the page contains no form at all or if
        *formxpath* does not lead to a form, and ``IndexError`` if
        *formnumber* is out of range.
        """
        root = create_root_node(response.text, lxml.html.HTMLParser,
                                base_url=get_base_url(response))
        forms = root.xpath('//form')
        if not forms:
            raise ValueError("No <form> element found in %s" % response)

        if formname is not None:
            by_name = root.xpath('//form[@name="%s"]' % formname)
            if by_name:
                return by_name[0]

        if formid is not None:
            by_id = root.xpath('//form[@id="%s"]' % formid)
            if by_id:
                return by_id[0]

        if formxpath is not None:
            matched = root.xpath(formxpath)
            if matched:
                # Climb from the first matched node until a form is reached
                # or the tree runs out of ancestors.
                node = matched[0]
                while node is not None:
                    if node.tag == 'form':
                        return node
                    node = node.getparent()
            if six.PY3:
                encoded = formxpath
            else:
                encoded = formxpath.encode('unicode_escape')
            raise ValueError('No <form> element found with %s' % encoded)

        # Reaching this point means the name/id lookups were not requested
        # or matched nothing, so fall back to the positional index.
        if formnumber is not None:
            try:
                return forms[formnumber]
            except IndexError:
                raise IndexError("Form number %d not found in %s" %
                                 (formnumber, response))
    
    
    def _get_inputs(form, formdata, dont_click, clickdata, response):
        """Collect the ``(name, value)`` pairs to submit for *form*.

        Values found in the form's own fields come first (skipping unnamed
        fields and names overridden in *formdata*), then the clickable element
        unless *dont_click* is set, then every entry of *formdata*.

        Raises ``ValueError`` when *formdata* cannot be turned into a dict.
        """
        try:
            formdata = dict(formdata or ())
        except (ValueError, TypeError):
            raise ValueError('formdata should be a dict or iterable of tuples')

        inputs = form.xpath('descendant::textarea'
                            '|descendant::select'
                            '|descendant::input[not(@type) or @type['
                            ' not(re:test(., "^(?:submit|image|reset)$", "i"))'
                            ' and (../@checked or'
                            '  not(re:test(., "^(?:checkbox|radio)$", "i")))]]',
                            namespaces={
                                "re": "http://exslt.org/regular-expressions"})

        values = []
        for element in inputs:
            key, val = _value(element)
            if not key or key in formdata:
                continue
            # A field without a value is submitted as an empty string.
            values.append((key, u'' if val is None else val))

        if not dont_click:
            clickable = _get_clickable(clickdata, form)
            if clickable and clickable[0] not in formdata and clickable[0] is not None:
                values.append(clickable)

        values.extend(formdata.items())
        return values
    
    
    def _value(ele):
        n = ele.name
        v = ele.value
        if ele.tag == 'select':
            return _select_value(ele, n, v)
        return n, v
    
    
    def _select_value(ele, n, v):
        multiple = ele.multiple
        if v is None and not multiple:
            # Match browser behaviour on simple select tag without options selected
            # And for select tags wihout options
            o = ele.value_options
            return (n, o[0]) if o else (None, None)
        elif v is not None and multiple:
            # This is a workround to bug in lxml fixed 2.3.1
            # fix https://github.com/lxml/lxml/commit/57f49eed82068a20da3db8f1b18ae00c1bab8b12#L1L1139
            selected_options = ele.xpath('.//option[@selected]')
            v = [(o.get('value') or o.text or u'').strip() for o in selected_options]
        return n, v
    
    
    def _get_clickable(clickdata, form):
        """
        Returns the clickable element specified in clickdata,
        if the latter is given. If not, it returns the first
        clickable element found
        """
        clickables = [
            el for el in form.xpath(
                'descendant::*[(self::input or self::button)'
                ' and re:test(@type, "^submit$", "i")]'
                '|descendant::button[not(@type)]',
                namespaces={"re": "http://exslt.org/regular-expressions"})
        ]
        if not clickables:
            return
    
        # If we don't have clickdata, we just use the first clickable element
        if clickdata is None:
            el = clickables[0]
            return (el.get('name'), el.get('value') or '')
    
        # If clickdata is given, we compare it to the clickable elements to find a
        # match. We first look to see if the number is specified in clickdata,
        # because that uniquely identifies the element
        nr = clickdata.get('nr', None)
        if nr is not None:
            try:
                el = list(form.inputs)[nr]
            except IndexError:
                pass
            else:
                return (el.get('name'), el.get('value') or '')
    
        # We didn't find it, so now we build an XPath expression out of the other
        # arguments, because they can be used as such
        xpath = u'.//*' + \
                u''.join(u'[@%s="%s"]' % c for c in six.iteritems(clickdata))
        el = form.xpath(xpath)
        if len(el) == 1:
            return (el[0].get('name'), el[0].get('value') or '')
        elif len(el) > 1:
            raise ValueError("Multiple elements found (%r) matching the criteria "
                             "in clickdata: %r" % (el, clickdata))
        else:
            raise ValueError('No clickable element matching clickdata: %r' % (clickdata,))
    

    This is the code I used to call the request (in my case I needed to upload an image):

    # Usage fragment: `yield` and `self` mean it belongs inside a spider
    # method; `os`, `img_path` and `upload_url` must come from the
    # surrounding code.
    with open(img_path, 'rb') as file:
        # Binary mode: the image is read into memory as raw bytes.
        img = file.read()
        # Only the base name of the path is passed as the upload's file name.
        file_name = os.path.basename(img_path)
        multipart_file = MultipartFile(file_name, img, "image/png")
        # Plain values are sent as ordinary fields; the MultipartFile value is
        # sent as a file part.
        form_data = {
            "param": "value", # this is an example of a text parameter
            "PicUpload": multipart_file
        }
        yield MultipartFormRequest(url=upload_url, formdata=form_data,
                                   callback=self.my_callback)
    

    It's a shame that so much time has passed and Scrapy still doesn't have a built-in way to do this, especially since someone wrote a very simple implementation years ago.

    0 讨论(0)
  • 2020-12-12 00:25

    At this very moment Scrapy has no built-in support for uploading files.

    File uploading via forms in HTTP was specified in RFC1867. According to the spec, an HTTP request with Content-Type: multipart/form-data is required (in your code it would be application/x-www-form-urlencoded).

    To achieve file uploading with Scrapy, you would need to:

    1. Get familiar with the basic concepts of HTTP file uploading.
    2. Start with scrapy.Request (instead of FormRequest).
    3. Give it a proper Content-Type header value.
    4. Build the request body yourself.

    See also: How does HTTP file upload work?

    0 讨论(0)
提交回复
热议问题