Question
I was making a table of run-times for different operations in Python 2.7, and noticed something that I cannot explain: the run-time of repr(2**n) and of int('1'*n) is O(n^2). I had always assumed that converting between an integer and a string would be O(n), with n being the number of digits. The results show that an O(n) fit gives roughly 30% error, while an O(n^2) fit gives only about 5%.
Could anyone please explain this?
Here are the results that I get (the code is below):
Test Number-1 -- time to compute int('1'*n) (fit to O(n**2))
Spec_string: 1000<=n<=10000 by factors of 2
var_list ['n']
Function list: ('n**2', 'n', '1')
run times:
n = 1000 : 10.045052 microseconds
n = 2000 : 35.215855 microseconds
n = 4000 : 141.801834 microseconds
n = 8000 : 480.597973 microseconds
Coefficients as interpolated from data:
7.17731e-06*n**2
+0.00487043*n
-2.0574*1
(measuring time in microseconds)
Sum of squares of residuals: 0.00673433934709
RMS error = 4.1 percent
Test Number-2 -- time to compute repr(2**n) (fit to O(n**2))
Spec_string: 1000<=n<=10000 by factors of 2
var_list ['n']
Function list: ('n**2', 'n', '1')
run times:
n = 1000 : 1.739025 microseconds
n = 2000 : 6.217957 microseconds
n = 4000 : 29.226065 microseconds
n = 8000 : 102.524042 microseconds
Coefficients as interpolated from data:
1.72846e-06*n**2
-0.000434518*n
+0.433448*1
(measuring time in microseconds)
Sum of squares of residuals: 0.0139070140697
RMS error = 5.9 percent
Test Number-3 -- time to compute int('1'*n) (fit to O(n))
Spec_string: 1000<=n<=10000 by factors of 2
var_list ['n']
Function list: ('n', '1')
run times:
n = 1000 : 10.187864 microseconds
n = 2000 : 37.642002 microseconds
n = 4000 : 128.378153 microseconds
n = 8000 : 492.624998 microseconds
Coefficients as interpolated from data:
0.0380857*n
-28.5106*1
(measuring time in microseconds)
Sum of squares of residuals: 0.268768241745
RMS error = 26 percent
Test Number-4 -- time to compute repr(2**n) (fit to O(n))
Spec_string: 1000<=n<=10000 by factors of 2
var_list ['n']
Function list: ('n', '1')
run times:
n = 1000 : 1.750946 microseconds
n = 2000 : 6.271839 microseconds
n = 4000 : 30.361176 microseconds
n = 8000 : 102.671146 microseconds
Coefficients as interpolated from data:
0.0070098*n
-5.40096*1
(measuring time in microseconds)
Sum of squares of residuals: 0.467752717824
RMS error = 34 percent
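The same growth is visible without any fitting: each time n doubles in the tables above, the measured time goes up by roughly a factor of 3.5-4 rather than 2, which is what you would expect for O(n^2) rather than O(n). A minimal doubling-ratio check (just a sketch for illustration, separate from the test harness below) could look like this:

import timeit

# Time int('1'*n) for doubling n and print the ratio between successive
# timings: a ratio near 2 suggests O(n), a ratio near 4 suggests O(n**2).
prev = None
for n in (1000, 2000, 4000, 8000):
    t = timeit.Timer("int(s)", "s = '1' * %d" % n)
    per_call = min(t.repeat(repeat=3, number=1000)) / 1000.0   # seconds per call
    if prev is None:
        print "n = %5d : %8.2f microseconds" % (n, per_call * 1e6)
    else:
        print "n = %5d : %8.2f microseconds (ratio to previous: %.2f)" % (n, per_call * 1e6, per_call / prev)
    prev = per_call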
Below is the code. You can ignore the support functions in it (lg, sqrt, make_param_list, fit, fit2).
main.py:
from support import *
def test_number():
    print
    print "Test Number-1 -- time to compute int('1'*n) (fit to O(n**2))"
    spec_string = "1000<=n<=10000"
    growth_factor = 2
    print "Spec_string: ",spec_string,"by factors of",growth_factor
    var_list, param_list = make_param_list(spec_string,growth_factor)
    f_list = ("n**2","n","1")
    run_times = []
    trials = 1000
    for D in param_list:
        t = timeit.Timer("string.atoi(x)","import string;x='1'*%(n)s"%D)
        run_times.append(t.timeit(trials)*1e6/float(trials))
    fit(var_list,param_list,run_times,f_list)

    print
    print "Test Number-2 -- time to compute repr(2**n) (fit to O(n**2))"
    spec_string = "1000<=n<=10000"
    growth_factor = 2
    print "Spec_string: ",spec_string,"by factors of",growth_factor
    var_list, param_list = make_param_list(spec_string,growth_factor)
    f_list = ("n**2","n","1")
    run_times = []
    trials = 1000
    for D in param_list:
        t = timeit.Timer("repr(x)","x=2**%(n)s"%D)
        run_times.append(t.timeit(trials)*1e6/float(trials))
    fit(var_list,param_list,run_times,f_list)

    print
    print "Test Number-3 -- time to compute int('1'*n) (fit to O(n))"
    spec_string = "1000<=n<=10000"
    growth_factor = 2
    print "Spec_string: ",spec_string,"by factors of",growth_factor
    var_list, param_list = make_param_list(spec_string,growth_factor)
    f_list = ("n","1")
    run_times = []
    trials = 1000
    for D in param_list:
        t = timeit.Timer("string.atoi(x)","import string;x='1'*%(n)s"%D)
        run_times.append(t.timeit(trials)*1e6/float(trials))
    fit(var_list,param_list,run_times,f_list)

    print
    print "Test Number-4 -- time to compute repr(2**n) (fit to O(n))"
    spec_string = "1000<=n<=10000"
    growth_factor = 2
    print "Spec_string: ",spec_string,"by factors of",growth_factor
    var_list, param_list = make_param_list(spec_string,growth_factor)
    f_list = ("n","1")
    run_times = []
    trials = 1000
    for D in param_list:
        t = timeit.Timer("repr(x)","x=2**%(n)s"%D)
        run_times.append(t.timeit(trials)*1e6/float(trials))
    fit(var_list,param_list,run_times,f_list)

if __name__ == '__main__':
    test_number()
support.py:
import math
import string
import timeit
import scipy.optimize

def lg(x):
    return math.log(x)/math.log(2.0)

def sqrt(x):
    return math.sqrt(x)

def make_param_list(spec_string,growth_factor):
    """
    Generate a list of dictionaries
    given maximum and minimum values for each range.
    Each min and max value is a *string* that can be evaluated;
    each string may depend on earlier variable values.
    Values increment by a factor of growth_factor from min to max.
    Example:
        make_param_list("1<=n<=1000")
        make_param_list("1<=n<=1000;1<=m<=1000;min(n,m)<=k<=max(n,m)")
    """
    var_list = []
    spec_list = string.split(spec_string,";")
    D = {}
    D['lg'] = lg
    D['sqrt'] = sqrt
    D_list = [D]
    for spec in spec_list:
        spec_parts = string.split(spec,"<=")
        assert len(spec_parts)==3
        lower_spec = spec_parts[0]
        var_name = spec_parts[1]
        assert len(var_name)==1
        var_list.append(var_name)
        upper_spec = spec_parts[2]
        new_D_list = []
        for D in D_list:
            new_D = D.copy()
            val = eval(lower_spec,D)
            while val<=eval(upper_spec,D):
                new_D[var_name] = val
                new_D_list.append(new_D.copy())
                val *= growth_factor
        D_list = new_D_list
    return (var_list,D_list)

def fit(var_list,param_list,run_times,f_list):
    """
    Return matrix A needed for least-squares fit.
    Given:
        list of variable names
        list of sample dicts for various parameter sets
        list of corresponding run times
        list of functions to be considered for fit
            these are *strings*, e.g. "n","n**2","min(n,m)",etc.
    prints:
        coefficients for each function in f_list
    """
    print "var_list",var_list
    print "Function list:",f_list
    print "run times:",
    for i in range(len(param_list)):
        print
        for v in var_list:
            print v,"= %6s"%param_list[i][v],
        print ": %8f"%run_times[i],"microseconds",
    print
    rows = len(run_times)
    cols = len(f_list)
    A = [ [0 for j in range(cols)] for i in range(rows) ]
    for i in range(rows):
        D = param_list[i]
        for j in range(cols):
            A[i][j] = float(eval(f_list[j],D))
    b = run_times
    (x,resids,rank,s) = fit2(A,b)
    print "Coefficients as interpolated from data:"
    for j in range(cols):
        sign = ''
        if x[j]>0 and j>0:
            sign = "+"
        elif x[j]>0:
            sign = " "
        print "%s%g*%s"%(sign,x[j],f_list[j])
    print "(measuring time in microseconds)"
    print "Sum of squares of residuals:",resids
    print "RMS error = %0.2g percent"%(math.sqrt(resids/len(A))*100.0)

def fit2(A,b):
    """ Relative error minimizer """
    def f(x):
        assert len(x) == len(A[0])
        resids = []
        for i in range(len(A)):
            sum = 0.0
            for j in range(len(A[0])):
                sum += A[i][j]*x[j]
            relative_error = (sum-b[i])/b[i]
            resids.append(relative_error)
        return resids
    ans = scipy.optimize.leastsq(f,[0.0]*len(A[0]))
    # print "ans:",ans
    if len(A[0])==1:
        x = [ans[0]]
    else:
        x = ans[0]
    resids = sum([r*r for r in f(x)])
    return (x,resids,0,0)
UPDATE 1: Even more confusing!
I checked the CPython implementation of intobject, and it looks like it is supposed to be linear:
/* Convert an integer to a decimal string.  On many platforms, this
   will be significantly faster than the general arbitrary-base
   conversion machinery in _PyInt_Format, thanks to optimization
   opportunities offered by division by a compile-time constant. */
static PyObject *
int_to_decimal_string(PyIntObject *v) {
    char buf[sizeof(long)*CHAR_BIT/3+6], *p, *bufend;
    long n = v->ob_ival;
    unsigned long absn;
    p = bufend = buf + sizeof(buf);
    absn = n < 0 ? 0UL - n : n;
    do {
        *--p = '0' + (char)(absn % 10);
        absn /= 10;
    } while (absn);
    if (n < 0)
        *--p = '-';
    return PyString_FromStringAndSize(p, bufend - p);
}
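One quick thing to check here (a rough sketch, assuming a stock CPython 2.7 build) is whether 2**n is still an int at all at these sizes, since int_to_decimal_string above only handles a PyIntObject, i.e. a value that fits in a C long:

# Print which integer type 2**n actually has on this build (stock CPython 2.7
# assumed): type 'int' wraps a C long, anything larger is promoted to 'long'.
for n in (30, 62, 63, 1000, 8000):
    print "2**%-4d has type %s" % (n, type(2 ** n).__name__)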
Source: https://stackoverflow.com/questions/40822290/repr-and-int-take-quadratic-time-in-python