Convert HTML into CSV


I want to convert an HTML table, as obtained from the script below, into a CSV file, but I get a TypeError as follows:

TypeError: sequence item 0: expected
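
(For context, this error typically appears when ",".join() is handed BeautifulSoup Tag objects instead of strings. A minimal illustration of the likely cause and the fix, using a made-up one-row table:)

    from bs4 import BeautifulSoup

    soup = BeautifulSoup("<table><tr><td>1938</td><td>-5.2</td></tr></table>", "html.parser")
    row = soup.find("tr")

    # ",".join(row.find_all("td"))   # TypeError: the items are Tags, not strings
    line = ",".join(td.get_text() for td in row.find_all("td"))   # "1938,-5.2"
    print(line)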

5 Answers
  • 2020-11-29 08:14

    Here is another way, without using the csv module (here `rows` is the list of <tr> tags from the parsed table):

    with open('data.csv', 'w') as fp:
        for row in rows[:-1]:   # skip the last row: its empty cells raise an error (this can also be handled if needed)
            fp.write(row.get_text(',') + '\n')
    

    You can then open the data.csv file directly.
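
    For completeness, here is a minimal end-to-end sketch of where `rows` would come from (Python 3, using requests as an assumed HTTP client; the URL and the data2_s class are taken from the other answers in this thread):

    import requests
    from bs4 import BeautifulSoup

    url = 'http://www.data.jma.go.jp/obd/stats/etrn/view/monthly_s3_en.php?block_no=47401&view=1'
    soup = BeautifulSoup(requests.get(url).content, 'html.parser')

    table = soup.find('table', class_='data2_s')   # the monthly-data table on the page
    rows = table.find_all('tr')

    with open('data.csv', 'w', encoding='utf-8') as fp:
        for row in rows[:-1]:                      # skip the last, partially empty row
            fp.write(row.get_text(',') + '\n')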

    The station details can be obtained with the commands below:

    >>> table = soup.find_all('table', class_='data2_s')
    >>> print table[0].find_all('caption')[0].get_text().encode('ascii', 'ignore')
    WAKKANAI   WMO Station ID:47401 Lat 45o24.9'N  Lon 141o40.7'E
    

    Hope this helps.

  • 2020-11-29 08:19
    import csv
    from bs4 import BeautifulSoup
    import pandas as pd
    
    html = open('test.html').read()
    soup = BeautifulSoup(html, features='lxml')
    # Specify which table you want to read, e.g. by its class attribute.
    # Example: <table class="queryResults" border="0" cellspacing="1">
    table = soup.select_one('table.queryResults')
    
    def get_all_tables(soup):
        return soup.find_all("table")
    
    
    # Optional: print every table in the document to find the one you need
    tbls = get_all_tables(soup)
    for i, tablen in enumerate(tbls, start=1):
        print(i)
        print(tablen)
    
    def get_table_headers(table):
        headers = []
        for th in table.find("tr").find_all("th"):
            headers.append(th.text.strip())
        return headers
    
    head = get_table_headers(table)
    #print(head)
    
    def get_table_rows(table):    
        rows = []
        for tr in table.find_all("tr")[1:]:
            cells = []
            # grab all td tags in this table row
            tds = tr.find_all("td")
            if len(tds) == 0:
                # if there are no td tags, fall back to th tags
                # (this happens e.g. in Wikipedia tables that use th cells inside body rows)
                ths = tr.find_all("th")
                for th in ths:
                    cells.append(th.text.strip())
            else:
                # use regular td tags
                for td in tds:
                    cells.append(td.text.strip())
            rows.append(cells)
        return rows
    
    table_rows = get_table_rows(table)
    #print(table_rows)
    
    def save_as_csv(table_name, headers, rows):
        pd.DataFrame(rows, columns=headers).to_csv(f"{table_name}.csv")
    
    save_as_csv("Test_table", head, table_rows)
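
    As an aside (not part of the original answer): since pandas is already imported here, pandas.read_html can often do the header/row extraction in one step. A minimal sketch, assuming the same test.html file and that lxml is installed:

    import pandas as pd

    with open('test.html') as fh:
        dfs = pd.read_html(fh)               # one DataFrame per <table> in the file
    dfs[0].to_csv('Test_table.csv', index=False)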
    
  • 2020-11-29 08:20

    This is a job for the csv lib: get each td inside each row and extract the text. It will also handle rows where some values are missing:

    from bs4 import BeautifulSoup
    import urllib2
    import csv
    
    url = 'http://www.data.jma.go.jp/obd/stats/etrn/view/monthly_s3_en.php?block_no=47401&view=1'
    html = urllib2.urlopen(url).read()
    soup = BeautifulSoup(html, "html.parser")
    table = soup.select_one("table.data2_s")
    # python3 just use th.text
    headers = [th.text.encode("utf-8") for th in table.select("tr th")]
    
    with open("out.csv", "w") as f:
        wr = csv.writer(f)
        wr.writerow(headers)
        wr.writerows([[td.text.encode("utf-8") for td in row.find_all("td")] for row in table.select("tr + tr")])
    

    Which matches the table exactly as you see on the page:

    :~$ cat out.csv
    Year,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec,Annual
    1938,-5.2,-4.9,-0.6,4.7,9.5,11.6,17.9,22.2,16.5,10.7,3.3,-4.7,6.8
    1939,-7.5,-6.6,-1.4,4.0,7.5,13.0,17.4,20.0,17.4,9.7,3.0,-2.5,6.2
    1940,-6.0,-5.7,-0.5,3.5,8.5,11.0,16.6,19.7,15.6,10.4,3.7,-1.0,6.3
    1941,-6.5,-5.8,-2.6,3.6,8.1,11.4,12.7,16.5,16.0,10.0,4.0,-2.9,5.4
    1942,-7.8,-8.2,-0.8,3.5,7.1,12.0,17.4,18.4,15.7,10.5,2.5,-2.9,5.6
    1943,-4.1,-6.1,-1.1,3.5,6.9,12.9,19.3,21.5,17.5,11.7,1.2,-3.6,6.6
    1944,-7.7,-7.9,-2.2,1.7,8.9,13.7,19.0,21.3,16.6,10.8,1.3,-6.0,5.8
    1945,-7.8,-6.9,-1.8,3.9,5.5,11.0,13.6,18.7,16.8,11.0,3.9,-4.8,5.3
    1946,-6.5,-6.0,-3.3,4.5,7.6,14.9,18.2,22.2,16.9,11.5,4.4,-2.5,6.8
    1947,-4.9,-5.5,-2.3,3.7,9.0,11.2,17.1,19.3,15.1,10.6,2.4,-4.6,5.9
    1948,-2.7,-4.4,-0.2,6.0,10.7,12.2,16.2,22.0,16.9,11.1,4.2,-0.6,7.6
    1949,-2.6,-2.8,-3.4,2.0,9.4,11.8,16.9,20.8,17.8,10.8,3.1,-3.8,6.7
    1950,-5.7,-4.8,-1.3,4.0,9.2,14.6,19.3,22.6,16.8,9.0,3.0,-2.9,7.0
    1951,-6.7,-6.5,-2.2,3.7,9.5,12.3,16.7,22.3,15.6,10.1,3.7,-0.3,6.5
    1952,-5.7,-7.1,-2.4,3.8,8.3,13.1,16.4,19.7,17.0,11.3,0.9,-7.1,5.7
    1953,-7.7,-7.3,-0.9,3.6,6.9,11.1,16.8,19.2,17.6,11.2,-0.6,-2.6,5.6
    1954,-6.7,-4.1,-2.5,4.0,7.5,11.0,13.7,17.0,17.2,9.5,3.2,-1.8,5.7
    1955,-6.4,-4.8,-1.3,4.7,7.0,12.7,20.3,19.5,15.5,10.6,3.6,-0.4,6.8
    1956,-6.1,-4.6,-2.0,5.1,10.8,11.2,13.8,16.3,17.2,12.3,2.8,-2.6,6.2
    1957,-3.9,-5.5,-2.9,4.4,9.3,10.9,17.1,18.2,15.5,11.1,5.4,-1.1,6.5
    1958,-4.9,-4.9,-2.3,4.4,8.5,12.6,17.5,18.3,16.8,10.6,4.5,-0.5,6.7
    1959,-7.3,-2.8,0.8,6.4,9.4,12.7,17.1,18.5,16.2,11.6,2.9,-3.9,6.8
    1960,-7.2,-5.2,-1.4,3.5,7.7,10.8,15.9,20.8,18.1,9.7,3.3,-3.9,6.0
    1961,-7.7,-5.3,-1.4,5.5,8.7,14.7,19.5,20.0,18.9,10.4,4.1,-1.3,7.2
    1962,-4.2,-5.4,-2.5,6.7,10.0,12.9,16.8,17.7,16.6,9.9,2.6,-1.5,6.6
    1963,-3.6,-3.7,0.1,5.0,10.4,12.4,16.8,17.1,15.6,10.7,4.3,-1.7,7.0
    1964,-4.5,-7.7,-1.3,3.7,9.9,11.9,15.3,17.7,14.9,10.0,3.6,-1.9,6.0
    1965,-4.1,-5.7,-2.8,3.2,9.1,13.3,15.2,18.8,15.8,11.4,2.1,-2.6,6.1
    1966,-5.0,-5.5,-1.0,3.2,8.1,12.2,15.3,17.5,15.4,11.6,4.1,-4.4,6.0
    1967,-6.8,-5.9,-0.7,4.5,10.0,11.4,16.4,20.5,15.5,11.0,1.8,-1.5,6.4
    1968,-4.2,-4.7,1.9,5.7,8.9,14.5,17.3,18.1,15.9,9.1,5.3,-0.7,7.3
    1969,-7.3,-7.5,-2.5,3.9,7.2,10.6,17.0,16.5,16.1,9.4,2.2,-5.4,5.0
    1970,-6.6,-6.0,-4.2,4.6,10.4,12.9,17.4,19.2,16.8,10.5,4.3,-3.3,6.3
    1971,-6.3,-6.4,-1.7,4.1,7.6,11.6,15.8,17.2,15.2,11.5,3.4,-2.2,5.8
    1972,-5.3,-5.0,-0.6,5.9,9.4,12.8,16.8,20.4,15.7,10.9,1.9,-1.4,6.8
    1973,-4.2,-5.3,-2.9,4.2,8.4,12.8,17.0,20.9,17.1,10.4,3.5,-1.9,6.7
    1974,-2.6,-4.6,-2.1,4.0,8.4,11.8,16.8,18.8,16.5,10.1,1.9,-5.7,6.1
    1975,-4.1,-6.1,-1.5,4.3,8.4,13.7,16.1,20.6,17.3,10.4,3.8,-3.8,6.6
    1976,-4.6,-3.5,-1.4,4.0,8.9,11.9,17.5,17.6,15.7,10.2,1.3,-2.0,6.3
    1977,-8.3,-7.1,-1.0,3.6,8.0,11.9,18.2,19.1,17.4,11.4,4.5,-1.8,6.3
    1978,-6.7,-9.2,-1.6,4.3,9.2,13.5,20.6,21.3,17.4,9.6,3.4,-2.1,6.6
    1979,-6.9,-4.5,-2.5,2.7,7.8,13.2,15.8,20.3,16.9,11.3,2.9,-0.1,6.4
    1980,-5.4,-7.1,-1.9,1.9,7.8,12.9,15.9,16.5,16.0,10.0,4.3,-0.6,5.9
    1981,-5.4,-6.3,-2.6,5.6,8.1,11.8,17.1,18.7,16.0,10.5,0.8,-0.6,6.1
    1982,-5.6,-5.3,-0.6,3.7,9.0,11.9,16.9,21.0,17.5,11.4,4.3,-1.0,6.9
    1983,-4.2,-7.6,-1.9,6.8,8.2,8.5,14.5,18.9,15.8,8.9,4.8,-2.1,5.9
    1984,-4.9,-6.6,-3.3,2.9,7.9,15.5,19.5,20.5,16.6,9.2,2.3,-3.6,6.3
    1985,-8.7,-4.8,-1.4,4.9,8.6,11.7,16.6,21.1,15.7,10.3,2.7,-4.2,6.0
    1986,-7.2,-6.5,-2.4,4.6,8.4,11.2,14.4,19.6,16.8,9.1,2.1,-1.9,5.7
    1987,-6.4,-5.6,-1.4,4.2,8.6,12.6,17.5,18.0,16.4,11.1,2.0,-3.1,6.2
    1988,-4.8,-6.3,-1.8,4.1,8.0,12.6,14.1,20.4,16.1,10.4,2.0,-1.5,6.1
    1989,-2.6,-2.4,0.8,4.0,8.2,10.7,18.4,20.4,16.8,10.8,4.8,-1.3,7.4
    1990,-5.7,-2.4,1.4,5.7,9.3,13.4,18.9,20.3,17.1,13.3,6.2,1.2,8.2
    1991,-1.6,-3.6,-1.5,4.8,10.1,14.3,16.2,19.0,16.6,11.8,3.5,-2.3,7.3
    1992,-3.6,-3.6,-0.4,3.7,8.1,12.1,17.6,18.0,14.9,11.1,3.2,-1.2,6.7
    1993,-2.7,-3.3,-0.2,3.1,8.6,10.7,15.6,17.6,16.3,11.1,3.7,-1.6,6.6
    1994,-6.1,-2.7,-1.3,4.4,10.0,12.8,17.4,21.7,17.5,11.8,4.3,-2.9,7.2
    1995,-4.0,-4.0,-0.8,4.8,11.0,12.7,18.4,19.3,16.3,12.3,5.2,-0.6,7.6
    1996,-4.6,-4.5,-1.0,3.5,6.9,12.0,15.9,18.7,16.8,10.4,2.3,-2.4,6.2
    1997,-3.0,-3.3,-1.5,4.3,7.3,11.7,17.4,17.2,16.1,10.3,6.4,-0.7,6.9
    1998,-6.9,-5.1,0.3,5.3,10.1,12.9,15.5,18.1,17.2,12.5,2.0,-2.4,6.6
    1999,-4.1,-5.6,-2.6,4.2,8.4,14.5,16.6,21.0,18.3,11.2,3.8,-1.9,7.0
    2000,-4.2,-5.6,-2.1,3.5,9.3,12.8,18.9,21.5,17.7,10.6,1.5,-4.1,6.7
    2001,-6.3,-7.7,-2.4,4.7,8.5,13.0,17.4,18.7,15.6,10.8,4.0,-4.2,6.0
    2002,-3.6,-1.0,0.5,6.8,11.1,12.1,15.7,17.1,17.0,10.8,2.3,-4.4,7.0
    2003,-4.7,-5.6,-0.7,5.3,10.1,13.9,14.3,18.4,16.6,11.3,4.5,-1.4,6.8
    2004,-3.9,-3.0,-0.5,4.4,10.6,14.6,16.8,19.7,17.8,11.8,5.9,-2.0,7.7
    2005,-4.6,-5.7,-1.0,3.9,7.0,14.3,16.7,21.0,17.9,12.6,4.9,-2.3,7.1
    2006,-5.5,-4.7,-0.9,2.1,9.3,11.9,18.4,21.6,17.7,11.0,4.5,-1.8,7.0
    2007,-3.7,-3.2,-0.7,3.5,7.6,14.3,16.7,20.4,17.0,10.9,3.0,-1.7,7.0
    2008,-6.0,-4.8,0.6,6.0,8.3,11.9,17.9,18.8,17.9,11.5,3.8,-0.4,7.1
    2009,-2.4,-4.4,0.0,4.5,10.0,12.3,14.8,18.6,16.9,11.4,3.1,-2.2,6.9
    2010,-3.4,-4.9,-1.4,3.5,7.3,15.0,18.1,22.4,18.4,11.4,4.8,-1.1,7.5
    2011,-5.1,-2.2,-0.6,4.4,6.5,12.8,17.5 ),21.5,18.3,12.1,4.9,-2.3,7.3
    2012,-5.4,-6.4,-2.4,4.6,8.9,12.6,17.2,20.4,19.4,11.8,3.8,-3.0,6.8
    2013,-5.8,-5.1,-1.3,4.5,7.2,14.0,18.9,20.2,17.6,11.8,5.5,-0.2,7.3
    2014,-5.3,-4.2,-1.2,3.9,8.7,13.9,19.2,20.0,16.7,11.0,4.8,-2.3,7.1
    2015,-2.9,-1.7,2.3,5.9,9.9,12.1,17.6,19.0,17.3,10.4,3.7,-0.2,7.8
    2016,-5.2,-4.7,0.5,4.3,11.4,12.5,17.4,21.8 ], , , , ,5.2 ]
    

    If you want the caption, use table.select_one("caption.m").text:

    with open("out.csv", "w") as f:
        wr = csv.writer(f)
        wr.writerow([table.select_one("caption.m").text.encode("utf-8")])
        wr.writerow(headers)
        wr.writerows([[td.text.encode("utf-8") for td in row.find_all("td")] 
    for row in table.select("tr + tr")])
    

    But it might be a better idea to use that as the name of the file, as opposed to adding it to the csv; for example:
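
    import re

    # Hypothetical helper (my own addition, not from the page): turn the caption
    # text into a safe filename before writing the rows to it.
    caption = table.select_one("caption.m").text
    fname = re.sub(r"[^\w.-]+", "_", caption.strip()) + ".csv"

    with open(fname, "w") as f:
        wr = csv.writer(f)
        wr.writerow(headers)
        wr.writerows([[td.text.encode("utf-8") for td in row.find_all("td")]
                      for row in table.select("tr + tr")])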

    If you really want to do it without the csv module, use the same logic with str.join:

    table = soup.select_one("table.data2_s")
    headers = [th.text.encode("utf-8") for th in table.select("tr th")]
    
    with open("out.csv", "w") as f:
        f.write(",".join(headers) + "\n")
        f.writelines(",".join([td.text.encode("utf-8") for td in row.find_all("td")]) + "\n" 
                     for row in table.select("tr + tr"))
    

    If you want to replace the empty cells with N/A:

    with open("out.csv", "w") as f:
        f.write(",".join(headers) + "\n")
        f.writelines(",".join([td.text.encode("utf-8").strip('\xe3\x80\x80') or "N/A" for td in row.find_all("td")]) + "\n"
                     for row in table.select("tr + tr"))
    

    Which will change the last row to:

    2016,-5.2,-4.7,0.5,4.3,11.4,12.5,17.4,21.8 ],N/A,N/A,N/A,N/A,5.2 ]
    

    The spaces used for the missing values are Unicode ideographic space characters (u"\u3000" in Python), which become '\xe3\x80\x80' when encoded to utf-8; we strip those, and if that leaves an empty string we substitute "N/A":

    In [7]: print u"\u3000"
     
    In [8]:  u"\u3000".encode("utf-8")
    Out[8]: '\xe3\x80\x80'
    In [9]:  u"\u3000".encode("utf-8").strip('\xe3\x80\x80')
    Out[9]: ''
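
    If you are on Python 3, where urllib2 no longer exists and csv expects text rather than bytes, a rough adaptation of the same approach would be (a sketch, not part of the original answer):

    from urllib.request import urlopen
    import csv
    from bs4 import BeautifulSoup

    url = 'http://www.data.jma.go.jp/obd/stats/etrn/view/monthly_s3_en.php?block_no=47401&view=1'
    soup = BeautifulSoup(urlopen(url).read(), 'html.parser')
    table = soup.select_one('table.data2_s')

    headers = [th.text for th in table.select('tr th')]

    with open('out.csv', 'w', newline='', encoding='utf-8') as f:
        wr = csv.writer(f)
        wr.writerow(headers)
        # replace ideographic-space-only cells with N/A, as above
        wr.writerows([td.text.strip('\u3000') or 'N/A' for td in row.find_all('td')]
                     for row in table.select('tr + tr'))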
    
  • 2020-11-29 08:27

    Use the csv module from Python to do this. You can obviously write more columns if you want, but the idea is that you're writing a list to the csv file. There are other options you can pass to csv.writer() if you'd like to quote things, escape things, etc.

    import csv
    
    with open('your_csv_name.csv', 'w') as o:
        w = csv.writer(o)
        # Headers
        w.writerow(['tr_content'])
        # Write the tr text
        for r in rows:
            w.writerow([r])
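
    Here `rows` is assumed to be the list of <tr> tags pulled out with BeautifulSoup, and each row is written as a single field. If you want one CSV column per table cell instead, you could split each row into its cell texts; a minimal sketch under that assumption:

    import csv

    # rows = soup.find_all('tr')  -- as extracted with BeautifulSoup
    with open('your_csv_name.csv', 'w', newline='') as o:
        w = csv.writer(o)
        for r in rows:
            # one CSV field per <th>/<td> cell instead of the whole row in one field
            w.writerow([cell.get_text(strip=True) for cell in r.find_all(['th', 'td'])])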
    