Standard Deviation for SQLite

后端 未结 10 673
生来不讨喜
生来不讨喜 2020-12-14 08:05

I've searched the SQLite docs and couldn't find anything, but I've also searched on Google and a few results appeared.

Does SQLite have any built-in Standard Deviation function?

相关标签:
10条回答
  • 2020-12-14 08:44

    You don't state which version of standard deviation you wish to calculate but variances (standard deviation squared) for either version can be calculated using a combination of the sum() and count() aggregate functions.

    -- Computes both variances of column val from count(val), sum(val) and sum(val*val).
    -- Shared numerator: n*sum(val^2) - sum(val)^2, where n = count(val).
    -- Denominator: n*(n-1) for the sample variance, n*n for the population variance.
    -- NOTE(review): if val holds only integers, "/" may perform integer division in
    -- SQLite and truncate the result -- confirm, and multiply the numerator by 1.0 if so.
    select  
    (count(val)*sum(val*val) - (sum(val)*sum(val)))/((count(val)-1)*(count(val))) as sample_variance,
    (count(val)*sum(val*val) - (sum(val)*sum(val)))/((count(val))*(count(val))) as population_variance
    from ... ;
    

    It will still be necessary to take the square root of these to obtain the standard deviation.

    0 讨论(0)
  • 2020-12-14 08:44
    #!/usr/bin/python
    # -*- coding: utf-8 -*-
    #Values produced by this script can be verified by following the steps
    #found at https://support.microsoft.com/en-us/kb/213930. To verify,
    #choose a non-memory-based (on-disk) database.
    import sqlite3
    import math
    import random
    import os
    import sys
    import traceback
    import random
    
    class StdevFunc:
        """Sample standard deviation aggregate for sqlite3's create_aggregate().

        The statistics are updated one value at a time (running mean plus a
        running sum of squared deviations), so no values are buffered.

        Attributes:
            M: running mean of the values accepted so far.
            V: running sum of squared deviations from the mean.
            S: standard deviation computed by the most recent finalize() call.
            k: 1-based index of the next value, i.e. accepted values + 1.
        """

        def __init__(self):
            self.M = 0.0    #Mean
            self.V = 0.0    #Used to Calculate Variance
            self.S = 0.0    #Standard Deviation
            self.k = 1      #Index of the next value (count + 1)

        def step(self, value):
            """Fold one row's value into the running statistics.

            None (SQL NULL) and values that do not support arithmetic with a
            float (e.g. text) are skipped and leave the state untouched.
            """
            if value is None:
                return None

            try:
                delta = value - self.M
            except (TypeError, OverflowError):
                #Best effort: a non-numeric value must not abort the whole query
                return None

            self.M += delta / self.k
            #Uses the deviation from the old mean times the deviation from the new mean
            self.V += delta * (value - self.M)
            self.k += 1
            return None

        def finalize(self):
            """Return the sample standard deviation, or None for fewer than 2 values.

            The accumulator V is left unchanged, so calling finalize() again
            returns the same result.
            """
            count = self.k - 1
            if count < 2:
                #The sample variance divides by (count - 1); undefined below 2 values
                return None

            #Standard Deviation is the Square Root of Variance
            self.S = math.sqrt(self.V / (count - 1))
            return self.S
    
    def Histogram(Population):
        """Print a standard-deviation based histogram of random sample data.

        Fills an in-memory SQLite table with Population values drawn from
        random.uniform(-1, 1), computes the aggregates (including the custom
        Stdev aggregate backed by StdevFunc) and prints a text histogram whose
        bin edges are Mean - 3*Stdev ... Mean + 3*Stdev in steps of one Stdev.
        Values not counted in any of those bins are reported in a final
        "More" row.

        Args:
            Population: number of random values to generate (an int).

        Returns:
            None. All results are printed.

        Raises:
            Exception: any failure is re-raised as a plain Exception whose
                message is the formatted traceback of the original error.
        """
        BinCount = 6    #The last real bin edge is Mean + 3 * Stdev; bin BinCount + 1 is "More"

        try:
            #If you want to store the Database, connect to a file path instead,
            #e.g. os.path.join(os.getcwd(), "BellCurve.db3")
            con = sqlite3.connect(':memory:')
            try:
                #row_factory allows accessing fields by Row and Col Name
                con.row_factory = sqlite3.Row

                #Add our Non Persistent, Runtime Standard Deviation Function to the Database
                con.create_aggregate("Stdev", 1, StdevFunc)

                cur = con.cursor()

                #Lets Initialize some tables, so each run will be clear of previous run
                cur.execute("drop table if exists MyData")
                cur.execute("create table IF NOT EXISTS MyData('ID' INTEGER PRIMARY KEY   AUTOINCREMENT, 'Val' FLOAT)")
                cur.execute("drop table if exists Bins")
                cur.execute("create table IF NOT EXISTS Bins('ID' INTEGER PRIMARY KEY   AUTOINCREMENT, 'Bin' UNSIGNED INTEGER, 'Val' FLOAT, 'Frequency' UNSIGNED BIG INT)")

                #Lets generate some random data, and insert in to the Database
                cur.executemany("insert into MyData(Val) values (?)",
                                ((random.uniform(-1, 1),) for n in range(0, Population)))

                #Built in Aggregates plus our own Stdev, all in a single pass
                cur.execute("select Avg(Val), Max(Val), Min(Val), Count(Val), Stdev(Val) from MyData")
                Average, Max, Min, Records, Stdev = tuple(cur.fetchone())

                if Stdev is None:
                    #Checked before anything touches Max/Min, which are None for an empty table
                    print("================================   Data Error ===============================")
                    print("                 Insufficient Population Size, Or Bad Data.")
                    print("*****************************************************************************")
                elif (abs(Max - Min) == 0):
                    print("================================   Data Error ===============================")
                    print(" The entire Population Contains Identical values, Distribution Incalculable.")
                    print("******************************************************************************")
                else:
                    Range = float(abs(float(Max) - float(Min)))

                    #Bin edges: the 1st is (Standard Deviation * 3) subtracted from the Mean,
                    #each following edge adds 1 Standard Deviation to the previous one.
                    #One edge past Mean + 3 * Stdev is kept as the upper bound of the last query.
                    Bin = [float(Average - (3 * Stdev))]
                    for b in range(0, BinCount + 1):
                        Bin.append(float(Bin[b]) + Stdev)
                    Frequency = [0] * (BinCount + 2)

                    More = 0
                    for b in range(0, BinCount + 1):
                        #Have the Database count the values in [Bin[b], Bin[b + 1]):
                        #left inclusive, right exclusive.
                        cur.execute("select count(*) as Frequency from MyData where Val >= ? and Val < ?",
                                    (float(Bin[b]), float(Bin[b + 1])))
                        Frequency[b + 1] = cur.fetchone()['Frequency']

                        #The row for edge Bin[b] stores the count of the interval ending at
                        #that edge, so the first row (b == 0) always reports 0.
                        cur.execute("insert into Bins (Bin, Val, Frequency) values (?, ?, ?)",
                                    (b, float(Bin[b]), Frequency[b]))

                        #Running total of everything reported in a real bin
                        More = (More + Frequency[b])

                    #Whatever was not reported in a real bin goes to "More"
                    More = abs((Records - More))

                    #Add the More value
                    cur.execute("insert into Bins (Bin, Val, Frequency) values (?, ?, ?)",
                                ((BinCount + 1), float(0), More))

                    #Now Report the Analysis
                    print("================================ The Population ==============================")
                    print("             {0} {1} {2} {3} {4} {5}".format(
                        "Size".rjust(10, ' '),
                        "Max".rjust(10, ' '),
                        "Min".rjust(10, ' '),
                        "Mean".rjust(10, ' '),
                        "Range".rjust(10, ' '),
                        "Stdev".rjust(10, ' ')))
                    print("Aggregates:  {0:10d} {1:10.4f} {2:10.4f} {3:10.4f} {4:10.4f} {5:10.4f}".format(
                        Population, Max, Min, Average, Range, Stdev))
                    print("================================= The Bell Curve =============================")

                    LabelString = "{0} {1}  {2}  {3}".format(
                        "Bin".ljust(8, ' '),
                        "Ranges".rjust(8, ' '),
                        "Frequency".rjust(8, ' '),
                        "Histogram".rjust(6, ' '))

                    print(LabelString)
                    print("------------------------------------------------------------------------------")

                    #Let's Paint a Histogram
                    cur.execute("select * from Bins order by Bin asc")
                    for rowChart in cur.fetchall():
                        if (rowChart['Bin'] == (BinCount + 1)):
                            #Not really a bin, but where we place the values that were
                            #not counted in any of the real bins.
                            ChartString = "{0:<6} {1:<10} {2:10.0f}".format("More", "", More)
                        else:
                            #These are the actual bins where values fall within the distribution.
                            #Scale the Chart: one '#' per (rounded up) percent of all records
                            fPercent = ((float(rowChart['Frequency']) / float(Records) * 100))
                            ChartString = "{0:<6} {1:10.4f} {2:10.0f}  {3}".format(
                                (rowChart['Bin'] + 1),
                                rowChart['Val'],
                                rowChart['Frequency'],
                                '#' * int(math.ceil(fPercent)))
                        print(ChartString)

                    print("******************************************************************************")

                    #Commit to Database
                    con.commit()
            finally:
                #Clean Up on every path, not only after a successful report
                con.close()

        except Exception:
            TraceInfo = traceback.format_exc()
            raise Exception(TraceInfo)
    
    0 讨论(0)
  • 2020-12-14 08:51

    There is still no built-in stdev function in SQLite. However, you can define (as Alix has done) a user-defined aggregate function. Here is a complete example in Python:

    import sqlite3
    import math
    
    class StdevFunc:
        """Aggregate computing the sample standard deviation for sqlite3.

        M is the running mean, S the running sum of squared deviations and
        k the 1-based index of the next value (accepted values + 1).
        """

        def __init__(self):
            self.M, self.S = 0.0, 0.0
            self.k = 1

        def step(self, value):
            """Fold one value into the running statistics; None is ignored."""
            if value is not None:
                previous_mean = self.M
                offset = value - previous_mean
                self.M = previous_mean + offset / self.k
                self.S = self.S + offset * (value - self.M)
                self.k = self.k + 1

        def finalize(self):
            """Return the sample standard deviation, or None below two values."""
            if self.k >= 3:
                return math.sqrt(self.S / (self.k - 2))
            return None
    
    # Demo: register StdevFunc as the SQL aggregate "stdev" and compare it with avg().
    # The connection's own context manager only commits or rolls back the
    # transaction; it does not close the connection, so close it explicitly.
    con = sqlite3.connect(':memory:')
    try:
        with con:
            con.create_aggregate("stdev", 1, StdevFunc)

            cur = con.cursor()

            cur.execute("create table test(i)")
            cur.executemany("insert into test(i) values (?)", [(1,), (2,), (3,), (4,), (5,)])
            # The NULL row shows that both aggregates ignore missing values
            cur.execute("insert into test(i) values (null)")
            cur.execute("select avg(i) from test")
            print("avg: %f" % cur.fetchone()[0])
            cur.execute("select stdev(i) from test")
            print("stdev: %f" % cur.fetchone()[0])
    finally:
        con.close()
    

    This will print:

    avg: 3.000000
    stdev: 1.581139
    

    Compare with MySQL: http://sqlfiddle.com/#!2/ad42f3/3/0

    0 讨论(0)
  • 2020-12-14 08:52

    Added some error detection to the Python functions:

    class StdevFunc:
        """
        Sample standard deviation, for use as an aggregate function in SQLite.

        M is the running mean, S the running sum of squared deviations and
        k the number of values accepted so far.
        """
        def __init__(self):
            self.M = 0.0
            self.S = 0.0
            self.k = 0

        def step(self, value):
            """Fold one value into the running statistics.

            The value is converted with float(); None (SQL NULL) and values
            that cannot be converted are skipped without changing the state.
            """
            try:
                # automatically convert text to float, like the rest of SQLite
                val = float(value)
            except (TypeError, ValueError, OverflowError):
                # nulls and non-numeric values: skip this iteration
                return
            tM = self.M
            self.k += 1
            self.M += ((val - tM) / self.k)
            self.S += ((val - tM) * (val - self.M))

        def finalize(self):
            """Return the sample standard deviation, or None for fewer than 2 values."""
            if self.k <= 1: # avoid division by zero
                return None
            return math.sqrt(self.S / (self.k-1))
    
    0 讨论(0)
提交回复
热议问题