Extracting raw data from a PowerPivot model using Python

前端 未结 3 1022
名媛妹妹
名媛妹妹 2021-01-31 20:53

What seemed like a trivial task turned into a real nightmare when I had to read in some data from a PowerPivot model using Python. I believe I've researched this very well over…

3条回答
  •  孤独总比滥情好
    2021-01-31 20:59

    Lo and behold, I finally managed to crack the problem - it turns out that accessing Power Pivot data using Python is indeed possible! Below is a short recap of what I did - you can find a more detailed description here: Analysis Services (SSAS) on a shoestring. Note: the code has been optimized neither for efficiency nor for elegance.

    • Install Microsoft Power BI Desktop (comes with free Analysis Services server, so no need for a costly SQL Server license - however, the same approach obviously also works if you have a proper license).
    • Fire up the AS engine by first creating the msmdsrv.ini settings file, then restore the database from the ABF file (using AMO.NET), and finally extract the data using ADOMD.NET.

    Here's the Python code that illustrates the AS engine + AMO.NET parts:

    import psutil, subprocess, random, os, zipfile, shutil, clr, sys, pandas
    
    def initialSetup(pathPowerBI):
        """Load the Analysis Services .NET assemblies and expose them as globals.

        ``pathPowerBI`` is appended to ``sys.path`` so that ``clr.AddReference``
        can locate the assemblies by name. After the call, the module-level
        names ``AMO`` (Microsoft.AnalysisServices) and ``ADOMD``
        (Microsoft.AnalysisServices.AdomdClient) are bound for the other
        functions in this file to use.
        """
        global AMO, ADOMD

        sys.path.append(pathPowerBI)

        # Analysis Services assemblies required by the rest of the script,
        # referenced in the same order as before.
        requiredAssemblies = (
            "Microsoft.PowerBI.Amo.Core",
            "Microsoft.PowerBI.Amo",
            "Microsoft.PowerBI.AdomdClient",
        )
        for assemblyName in requiredAssemblies:
            clr.AddReference(assemblyName)

        # These namespaces only become importable once the references are added.
        import Microsoft.AnalysisServices as AMO
        import Microsoft.AnalysisServices.AdomdClient as ADOMD
    
    def restorePowerPivot(excelName, pathTarget, port, pathPowerBI):
        """Start a private Analysis Services engine and restore a PowerPivot model.

        Steps performed:
          1. Create a randomly named scratch folder inside ``pathTarget``.
          2. Copy every ``.data`` member of the workbook (a zip archive) into
             that folder with an ``.abf`` extension.
          3. Spawn ``cmd.exe`` and write an ``msmdsrv.ini`` that points the
             engine at the scratch folder, ``port`` and the cmd.exe PID.
          4. Launch ``msmdsrv.exe`` from ``pathPowerBI`` inside that cmd.exe.
          5. Connect through AMO, restore the ``.abf`` backup and disconnect.

        Requires ``initialSetup`` to have been called first, because it binds
        the global ``AMO``.

        Args:
            excelName: path to the workbook containing the PowerPivot model.
            pathTarget: directory in which the scratch folder is created.
            port: TCP port the engine should listen on.
            pathPowerBI: directory containing ``msmdsrv.exe``.

        Returns:
            The ``subprocess.Popen`` object of the cmd.exe hosting the engine.

        Raises:
            ValueError: if the workbook contains no ``.data`` member.
        """
        # The chdir is kept for backward compatibility: a relative excelName
        # has always been resolved against pathTarget.
        os.chdir(pathTarget)
        # os.path.join instead of plain concatenation, so the scratch folder is
        # created *inside* pathTarget even when it is not a drive root.
        folder = os.path.join(os.getcwd(), str(random.randrange(10**6, 10**7)))
        os.mkdir(folder)

        # extract PowerPivot model (abf backup)
        abfname = None
        with zipfile.ZipFile(excelName) as archive:
            for member in archive.namelist():
                if ".data" in member:
                    abfname = os.path.join(folder, os.path.basename(member)) + ".abf"
                    # open() replaces the Python 2-only file() builtin; the
                    # with-block closes both handles deterministically.
                    with archive.open(member) as source, open(abfname, 'wb') as target:
                        shutil.copyfileobj(source, target)
        if abfname is None:
            raise ValueError(
                "no PowerPivot model ('.data' member) found in {0}".format(excelName))

        # Start the cmd.exe process that will host the engine. Popen already
        # knows the child's PID, so there is no need to diff process snapshots.
        # universal_newlines=True makes stdin accept text on Python 3 as well.
        process = subprocess.Popen('cmd.exe /k', stdin=subprocess.PIPE,
                                   universal_newlines=True)
        pid = process.pid

        # msmdsrv.ini
        # NOTE(review): the XML element names were missing from the snippet
        # (only the values survived) and have been reconstructed to match the
        # value sequence; verify them against the msmdsrv.ini that Power BI
        # Desktop generates before relying on this template.
        msmdsrvText = '''<ConfigurationSettings>
           <DataDir>{0}</DataDir>
           <TempDir>{0}</TempDir>
           <LogDir>{0}</LogDir>
           <BackupDir>{0}</BackupDir>
           <DeploymentMode>2</DeploymentMode>
           <RecoveryModel>1</RecoveryModel>
           <DisklessModeRequested>0</DisklessModeRequested>
           <CleanDiskCacheOnRestart>1</CleanDiskCacheOnRestart>
           <AutoSetDefaultInitialCatalog>1</AutoSetDefaultInitialCatalog>
           <Network>
              <Requests>
                 <EnableBinaryXML>1</EnableBinaryXML>
                 <EnableCompression>1</EnableCompression>
              </Requests>
              <Responses>
                 <EnableBinaryXML>1</EnableBinaryXML>
                 <EnableCompression>1</EnableCompression>
                 <CompressionLevel>9</CompressionLevel>
              </Responses>
              <ListenOnlyOnLocalConnections>1</ListenOnlyOnLocalConnections>
           </Network>
           <Port>{1}</Port>
           <PrivateProcess>{2}</PrivateProcess>
           <InstanceVisible>0</InstanceVisible>
           <Language>1033</Language>
           <Debug>
              <CallStackInError>0</CallStackInError>
           </Debug>
           <Log>
              <Exception>
                 <CrashReportsFolder>{0}</CrashReportsFolder>
              </Exception>
              <FlightRecorder>
                 <Enabled>0</Enabled>
              </FlightRecorder>
           </Log>
           <AllowedBrowsingFolders>{0}</AllowedBrowsingFolders>
           <ResourceGovernance>
              <GroupMemoryLimit>0</GroupMemoryLimit>
           </ResourceGovernance>
           <Feature>
              <ManagedCodeEnabled>1</ManagedCodeEnabled>
           </Feature>
           <VertiPaq>
              <EnableDisklessTMImageSave>0</EnableDisklessTMImageSave>
              <EnablePersistVersionsFolder>1</EnablePersistVersionsFolder>
           </VertiPaq>
        </ConfigurationSettings>'''

        # save ini file to disk, filled with the required parameters
        with open(os.path.join(folder, "msmdsrv.ini"), "w") as msmdsrvini:
            msmdsrvini.write(msmdsrvText.format(folder, port, pid))  # {0},{1},{2}

        # run AS engine inside the cmd.exe process
        initString = "\"{0}\\msmdsrv.exe\" -c -s \"{1}\""
        initString = initString.format(pathPowerBI.replace("/", "\\"), folder)
        process.stdin.write(initString + " \n")
        # Without a flush the command can sit in the pipe buffer and the engine
        # would never start.
        process.stdin.flush()

        # connect to the AS instance from Python
        AMOServer = AMO.Server()
        AMOServer.Connect("localhost:{0}".format(port))

        # restore database from PowerPivot abf backup; always disconnect
        try:
            AMOServer.Restore(AMO.RestoreInfo(abfname))
        finally:
            AMOServer.Disconnect()

        return process
    

    And the data-extraction part:

    def runQuery(query, port, flag):
        """Run ``query`` against the local AS instance and return the result.

        Requires ``initialSetup`` to have been called first, because it binds
        the global ``ADOMD``.

        Args:
            query: command text sent to the server.
            port: TCP port of the Analysis Services instance on localhost.
            flag: ``0`` to return only the data; any other value to also
                return the reader's schema table.

        Returns:
            ``df`` when ``flag == 0``, otherwise the tuple ``(df, metadf)``.
            ``df`` holds one row per record read, with columns named after the
            first field of each schema-table row. ``metadf`` is the schema
            table itself. Cell values are stringified via ``ToString()`` where
            that method exists.
        """
        def toText(value):
            # Presumably values already converted to native Python objects have
            # no ToString(); those are kept unchanged, as before.
            try:
                return value.ToString()
            except AttributeError:
                return value

        # ADOMD assembly
        ADOMDConn = ADOMD.AdomdConnection("Data Source=localhost:{0}".format(port))
        ADOMDConn.Open()
        try:
            ADOMDCommand = ADOMDConn.CreateCommand()
            ADOMDCommand.CommandText = query

            # read data in via AdomdDataReader object
            DataReader = ADOMDCommand.ExecuteReader()
            try:
                # get metadata, number of columns
                SchemaTable = DataReader.GetSchemaTable()
                numCol = SchemaTable.Rows.Count  # same as DataReader.FieldCount

                # get column names
                columnNames = [str(SchemaTable.Rows[i][0]) for i in range(numCol)]

                # fill with data
                data = []
                while DataReader.Read():
                    data.append([toText(DataReader[j]) for j in range(numCol)])
                # Passing columns= here (instead of assigning df.columns later)
                # keeps an empty result set from raising a length-mismatch error.
                df = pandas.DataFrame(data, columns=columnNames)

                if flag == 0:
                    return df

                # metadata table
                numMetaCol = SchemaTable.Columns.Count
                metadataColumnNames = [SchemaTable.Columns[j].ToString()
                                       for j in range(numMetaCol)]
                metadata = [[toText(SchemaTable.Rows[i][j]) for j in range(numMetaCol)]
                            for i in range(numCol)]
                metadf = pandas.DataFrame(metadata, columns=metadataColumnNames)

                return df, metadf
            finally:
                # Close the reader even if reading or DataFrame building fails.
                DataReader.Close()
        finally:
            ADOMDConn.Close()
    

    The raw data are then extracted via something like this:

    # Example driver: start the engine, restore the sample workbook's model,
    # pull one table (plus its schema table) and shut the session down.
    pathPowerBI = "C:/Program Files/Microsoft Power BI Desktop/bin"
    port = 60000  # single definition so restore and query cannot drift apart
    initialSetup(pathPowerBI)
    session = restorePowerPivot("D:/Downloads/PowerPivotTutorialSample.xlsx", "D:/", port, pathPowerBI)
    try:
        df, metadf = runQuery("EVALUATE dbo_DimProduct", port, 1)
    finally:
        # Clean up even when the query fails, so the engine process is not
        # left running.
        # NOTE(review): endSession is not defined in this snippet; presumably it
        # terminates the process returned by restorePowerPivot - confirm.
        endSession(session)
    

提交回复
热议问题