how to convert xls to xlsx

牧云@^-^@ 提交于 2019-11-27 03:51:32
Ray

I've had to do this before. The main idea is to use the xlrd module to open and parse an xls file and write the content to an xlsx file using the openpyxl module.

Here's my code. Attention! It cannot handle complex xls files; you should add your own parsing logic if you are going to use it.

import xlrd
from openpyxl.workbook import Workbook
from openpyxl.reader.excel import load_workbook, InvalidFileException

def open_xls_as_xlsx(filename):
    """Copy the first non-empty sheet of an XLS file into a new openpyxl workbook.

    Only raw cell values are copied; styles, formulas, merged cells and any
    further sheets are ignored.

    @param filename: path of the .xls file, handed to xlrd.open_workbook
    @return: openpyxl Workbook whose active sheet holds the copied values;
             the sheet is left empty when no sheet of the source has data
    """
    # first open using xlrd
    book = xlrd.open_workbook(filename)

    # prepare a xlsx sheet
    book1 = Workbook()
    sheet1 = book1.active

    # Pick the first sheet that has at least one row and one column. Bounding
    # the search by nsheets avoids running past the last sheet when every
    # sheet is empty.
    sheet = None
    for index in range(book.nsheets):
        candidate = book.sheet_by_index(index)
        if candidate.nrows * candidate.ncols != 0:
            sheet = candidate
            break
    if sheet is None:
        return book1

    for row in range(sheet.nrows):
        for col in range(sheet.ncols):
            # xlrd indexes are 0-based while openpyxl rows/columns start at 1.
            sheet1.cell(row=row + 1, column=col + 1).value = sheet.cell_value(row, col)

    return book1

You need to have win32com installed on your machine. Here is my code:

import win32com.client as win32

# Path of the .xls workbook to convert; the result is saved next to it
# under the same name with an extra "x" appended (".xls" -> ".xlsx").
fname = "full+path+to+xls_file"
excel = win32.gencache.EnsureDispatch('Excel.Application')
try:
    wb = excel.Workbooks.Open(fname)
    try:
        # FileFormat = 51 is for .xlsx extension
        # (FileFormat = 56 is for .xls extension)
        wb.SaveAs(fname + "x", FileFormat=51)
    finally:
        # Release the workbook even when saving failed.
        wb.Close()
finally:
    # Always quit, otherwise a failed conversion leaves Excel running.
    excel.Application.Quit()

Here is my solution, without considering fonts, charts and images:

$ pip install pyexcel pyexcel-xls pyexcel-xlsx

Then do this::

import pyexcel as p

# Save the book read from 'your-file-in.xls' under the new name
# 'your-new-file-out.xlsx' (fonts, charts and images are not carried over).
p.save_book_as(file_name='your-file-in.xls',
               dest_file_name='your-new-file-out.xlsx')

If you do not need a program, you could install one additional package, pyexcel-cli::

$ pip install pyexcel-cli
$ pyexcel transcode your-file-in.xls your-new-file-out.xlsx

The transcoding procedure above uses xlrd and openpyxl.

I found none of the answers here 100% right, so I am posting my code here:

import xlrd
from openpyxl.workbook import Workbook

def cvt_xls_to_xlsx(src_file_path, dst_file_path):
    """Convert every sheet of an XLS file and save the result as an XLSX file.

    Only raw cell values are copied; formatting, formulas and merged cells
    are not transferred.

    @param src_file_path: path of the .xls file to read (xlrd.open_workbook)
    @param dst_file_path: path the new .xlsx workbook is saved to
    @return: None
    """
    book_xls = xlrd.open_workbook(src_file_path)
    book_xlsx = Workbook()

    for sheet_index, sheet_name in enumerate(book_xls.sheet_names()):
        sheet_xls = book_xls.sheet_by_name(sheet_name)
        if sheet_index == 0:
            # Reuse the workbook's active sheet for the first source sheet.
            # `active` is a property, so it must not be called.
            sheet_xlsx = book_xlsx.active
            sheet_xlsx.title = sheet_name
        else:
            sheet_xlsx = book_xlsx.create_sheet(title=sheet_name)

        for row in range(sheet_xls.nrows):
            for col in range(sheet_xls.ncols):
                # xlrd is 0-based, openpyxl is 1-based.
                sheet_xlsx.cell(row=row + 1, column=col + 1).value = sheet_xls.cell_value(row, col)

    book_xlsx.save(dst_file_path)
Malexandre

The answer by Ray helped me a lot, but for those looking for a simple way to convert all the sheets from an xls to an xlsx, I made this Gist:

import xlrd
from openpyxl.workbook import Workbook as openpyxlWorkbook

# content is a string containing the file. For example the result of an http.request(url).
# You can also use a filepath by calling "xlrd.open_workbook(filepath)".

# Parse the XLS bytes held in `content` and start an empty XLSX workbook.
xlsBook = xlrd.open_workbook(file_contents=content)
workbook = openpyxlWorkbook()

for i in range(xlsBook.nsheets):
    xlsSheet = xlsBook.sheet_by_index(i)
    # The first source sheet reuses the workbook's active sheet; every
    # further source sheet gets a newly created one.
    sheet = workbook.active if i == 0 else workbook.create_sheet()
    sheet.title = xlsSheet.name

    for row in range(xlsSheet.nrows):
        for col in range(xlsSheet.ncols):
            # xlrd indexes are 0-based while openpyxl rows/columns start at 1.
            sheet.cell(row=row + 1, column=col + 1).value = xlsSheet.cell_value(row, col)

# The new xlsx file is in "workbook", without iterators (iter_rows).
# For iteration, use "for row in worksheet.rows:".
# For range iteration, use "for row in worksheet.range("{}:{}".format(startCell, endCell)):".

You can find the xlrd lib here and the openpyxl here (you must download xlrd in your project for Google App Engine for example).

Jhon Anderson

I improved the performance of @Jackypengyu's method.

Merged cells will be converted too.

Results

Convert same 12 files in same order:

Original:

0:00:01.958159
0:00:02.115891
0:00:02.018643
0:00:02.057803
0:00:01.267079
0:00:01.308073
0:00:01.245989
0:00:01.289295
0:00:01.273805
0:00:01.276003
0:00:01.293834
0:00:01.261401

Improved:

0:00:00.774101
0:00:00.734749
0:00:00.741434
0:00:00.744491
0:00:00.320796
0:00:00.279045
0:00:00.315829
0:00:00.280769
0:00:00.316380
0:00:00.289196
0:00:00.347819
0:00:00.284242

Solution

def cvt_xls_to_xlsx(*args, **kw):
    """Open and convert XLS file to openpyxl.workbook.Workbook object

    For every sheet the cell values, merged ranges and hidden rows/columns
    are carried over. Date cells become datetime.datetime objects, or
    datetime.time objects when the cell holds a time of day without a date.

    @param args: args for xlrd.open_workbook
    @param kw: kwargs for xlrd.open_workbook
    @return: openpyxl.workbook.Workbook
    """
    # Imported locally so the function does not rely on the caller's module
    # having these names in scope.
    import datetime

    from openpyxl.utils.cell import get_column_letter

    book_xls = xlrd.open_workbook(*args, formatting_info=True, ragged_rows=True, **kw)
    book_xlsx = Workbook()
    # Date serials have to be decoded with the workbook's own date mode
    # rather than a hard-coded 0.
    datemode = book_xls.datemode

    def _get_xlrd_cell_value(cell):
        """Return the cell value, converting date cells to datetime/time."""
        value = cell.value
        if cell.ctype == xlrd.XL_CELL_DATE:
            date_tuple = xlrd.xldate_as_tuple(value, datemode)
            if date_tuple[:3] == (0, 0, 0):
                # Time without a date: datetime.datetime would reject year 0.
                value = datetime.time(*date_tuple[3:])
            else:
                value = datetime.datetime(*date_tuple)

        return value

    for sheet_index, sheet_name in enumerate(book_xls.sheet_names()):
        sheet_xls = book_xls.sheet_by_name(sheet_name)

        if sheet_index == 0:
            sheet_xlsx = book_xlsx.active
            sheet_xlsx.title = sheet_name
        else:
            sheet_xlsx = book_xlsx.create_sheet(title=sheet_name)

        # ragged_rows=True: each row is only as long as its own data.
        for row in range(sheet_xls.nrows):
            sheet_xlsx.append((
                _get_xlrd_cell_value(cell)
                for cell in sheet_xls.row_slice(row, end_colx=sheet_xls.row_len(row))
            ))

        # Merge only after all rows were appended, so that cells touched by
        # merge_cells cannot influence where append() places the rows.
        for rlo, rhi, clo, chi in sheet_xls.merged_cells:
            # xlrd ranges are 0-based with exclusive upper bounds; openpyxl
            # wants 1-based inclusive bounds, hence "+ 1" on the lower ones only.
            sheet_xlsx.merge_cells(
                start_row=rlo + 1, end_row=rhi,
                start_column=clo + 1, end_column=chi,
            )

        # rowinfo_map / colinfo_map may lack entries for some indexes, so
        # look them up with .get() instead of indexing directly.
        for rowx in range(sheet_xls.nrows):
            row_info = sheet_xls.rowinfo_map.get(rowx)
            if row_info is not None and row_info.hidden != 0:
                print(sheet_name, rowx)
                sheet_xlsx.row_dimensions[rowx + 1].hidden = True
        for coly in range(sheet_xls.ncols):
            col_info = sheet_xls.colinfo_map.get(coly)
            if col_info is not None and col_info.hidden != 0:
                print(sheet_name, coly)
                coly_letter = get_column_letter(coly + 1)
                sheet_xlsx.column_dimensions[coly_letter].hidden = True

    return book_xlsx

Simple solution

I required a simple solution to convert a couple of xls files to xlsx format. There are plenty of answers here, but they are doing some "magic" that I do not completely understand.

A simple solution was given by chfw, but not quite complete.

Install dependencies

Use pip to install

pip install pyexcel-cli pyexcel-xls pyexcel-xlsx

Execute

All the styling and macros will be gone, but the information is intact.

For single file

pyexcel transcode your-file-in.xls your-new-file-out.xlsx

For all files in the folder, one liner

# Transcode every .xls file in the current folder to "<name>.xlsx" (no ";" after "do" - bash rejects it).
for file in *.xls; do echo "Transcoding $file"; pyexcel transcode "$file" "${file}x"; done

I tried @Jhon Anderson's solution; it works well, but I got a "year is out of range" error when there are cells in a time format like HH:mm:ss without a date. Therefore I improved the algorithm again:

def xls_to_xlsx(*args, **kw):
    """
    Open and convert an XLS file to openpyxl.workbook.Workbook.

    Cell values and merged ranges of every sheet are carried over. Date cells
    become datetime.datetime objects, or datetime.time objects when the cell
    holds a time of day without a date.
    ----------
    @param args: args for xlrd.open_workbook
    @param kw: kwargs for xlrd.open_workbook
    @return: openpyxl.workbook.Workbook object
    """
    # Imported locally so the function does not rely on the caller's module
    # having `datetime` in scope.
    import datetime

    book_xls = xlrd.open_workbook(*args, formatting_info=True, ragged_rows=True, **kw)
    book_xlsx = openpyxl.workbook.Workbook()
    # Date serials have to be decoded with the workbook's own date mode
    # rather than a hard-coded 0.
    datemode = book_xls.datemode

    def _get_xlrd_cell_value(cell):
        """Return the cell value, converting date cells to datetime/time."""
        value = cell.value
        if cell.ctype == xlrd.XL_CELL_DATE:
            datetime_tup = xlrd.xldate_as_tuple(value, datemode)
            if datetime_tup[0:3] == (0, 0, 0):   # time format without date
                value = datetime.time(*datetime_tup[3:])
            else:
                value = datetime.datetime(*datetime_tup)
        return value

    for sheet_index, sheet_name in enumerate(book_xls.sheet_names()):
        sheet_xls = book_xls.sheet_by_name(sheet_name)
        if sheet_index == 0:
            sheet_xlsx = book_xlsx.active
            sheet_xlsx.title = sheet_name
        else:
            sheet_xlsx = book_xlsx.create_sheet(title=sheet_name)

        # ragged_rows=True: each row is only as long as its own data.
        for row in range(sheet_xls.nrows):
            sheet_xlsx.append((
                _get_xlrd_cell_value(cell)
                for cell in sheet_xls.row_slice(row, end_colx=sheet_xls.row_len(row))
            ))

        # Merge only after all rows were appended, so that cells touched by
        # merge_cells cannot influence where append() places the rows.
        for rlo, rhi, clo, chi in sheet_xls.merged_cells:
            # xlrd ranges are 0-based with exclusive upper bounds; openpyxl
            # wants 1-based inclusive bounds, hence "+ 1" on the lower ones only.
            sheet_xlsx.merge_cells(
                start_row=rlo + 1, end_row=rhi,
                start_column=clo + 1, end_column=chi,
            )
    return book_xlsx

Then it works perfectly!

The Answer from Ray was clipping the first row and last column of the data. Here is my modified solution (for python3):

def open_xls_as_xlsx(filename):
    """Copy the first non-empty sheet of an XLS file into a new openpyxl workbook.

    Only raw cell values are copied; styles, formulas, merged cells and any
    further sheets are ignored.

    @param filename: path of the .xls file, handed to xlrd.open_workbook
    @return: openpyxl Workbook whose active sheet holds the copied values;
             the sheet is left empty when no sheet of the source has data
    """
    # first open using xlrd
    book = xlrd.open_workbook(filename)

    # prepare a xlsx sheet
    book1 = Workbook()
    sheet1 = book1.active

    # Find the first sheet with at least one row and one column. The sizes
    # are tested unmodified so that empty sheets really are skipped.
    source = None
    for index in range(book.nsheets):
        candidate = book.sheet_by_index(index)
        if candidate.nrows and candidate.ncols:
            source = candidate
            break
    if source is None:
        return book1

    # openpyxl rows/columns are 1-based, xlrd's are 0-based.
    for row in range(1, source.nrows + 1):
        for col in range(1, source.ncols + 1):
            sheet1.cell(row=row, column=col).value = source.cell_value(row - 1, col - 1)

    return book1
lordwilliamsr

CONVERT XLS FILE TO XLSX

Using Python 3.6, I have just come across the same issue, and after hours of struggle I solved it by doing the following. You probably won't need all of the packages. (I will be as clear as possible.)

make sure to install the following packages before proceeding

pip install pyexcel pyexcel-xls pyexcel-xlsx

pip install pyexcel-cli

step 1:

import pyexcel

step 2: "example.xls","example.xlsx","example.xlsm"

sheet0 = pyexcel.get_sheet(file_name="your_file_path.xls", name_columns_by_row=0)

step3: create array from contents

xlsarray = sheet0.to_array()  # sheet0 is the sheet loaded in step 2

step4: check variable contents to verify

xlsarray

step5: pass the array held in variable called (xlsarray) to a new workbook variable called(sheet1)

sheet1 = pyexcel.Sheet(xlsarray)

step6: save the new sheet ending with .xlsx (in my case i want xlsx)

sheet1.save_as("test.xlsx")

I tried @Jhon's solution first, then I turned to pyexcel as a solution:

# Let pyexcel read the file named by oldfilename and save it as newfilename.
pyexcel.save_as(file_name=oldfilename, dest_file_name=newfilename)

It worked properly until I tried to package my project into a single exe file with PyInstaller. I tried all the hidden-import options, but the following error was still there:

  File "utils.py", line 27, in __enter__
    pyexcel.save_as(file_name=self.filename, dest_file_name=newfilename)
  File "site-packages\pyexcel\core.py", line 77, in save_as
  File "site-packages\pyexcel\internal\core.py", line 22, in get_sheet_stream
  File "site-packages\pyexcel\plugins\sources\file_input.py", line 39, in get_data
  File "site-packages\pyexcel\plugins\parsers\excel.py", line 19, in parse_file
  File "site-packages\pyexcel\plugins\parsers\excel.py", line 40, in _parse_any
  File "site-packages\pyexcel_io\io.py", line 73, in get_data
  File "site-packages\pyexcel_io\io.py", line 91, in _get_data
  File "site-packages\pyexcel_io\io.py", line 188, in load_data
  File "site-packages\pyexcel_io\plugins.py", line 90, in get_a_plugin
  File "site-packages\lml\plugin.py", line 290, in load_me_now
  File "site-packages\pyexcel_io\plugins.py", line 107, in raise_exception
pyexcel_io.exceptions.SupportingPluginAvailableButNotInstalled: Please install pyexcel-xls
[3192] Failed to execute script

Then, I jumped to pandas:

# Read oldfilename into a DataFrame and write it straight back out as newfilename,
# into a sheet named self.sheetname and without the index column (index=False).
# NOTE(review): `self` means this line was taken from inside a method of the author's class.
pd.read_excel(oldfilename).to_excel(newfilename, sheet_name=self.sheetname,index=False)
易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!