Load large dataset into crossfilter/dc.js

前端 未结 3 639
暖寄归人
暖寄归人 2021-02-01 10:09

I built a crossfilter with several dimensions and groups to display the data visually using dc.js. The data visualized is bike trip data, and each trip will be loaded in. Right

相关标签:
3条回答
  • 2021-02-01 10:35

    I've observed similar issues with data (working in enterprise company), I found couple of ideas worth trying.

    1. your data have regular structure, so you can put keys in first row, and only data in following rows - imitating CSV (header first, data next)
    2. Date Time can be changed to epoch number (and you can move start of epoch to 01/01/2015 and calculate when received
    3. use oboe.js when getting response from server (http://oboejs.com/), as data-set will be large, consider using oboe.drop during load
    4. update visualization with JavaScript timer

    timer sample

    var datacnt=0;
    var timerId=setInterval(function () {
        // body...
        d3.select("#count-data-current").text(datacnt);
        //update visualization should go here, something like dc.redrawAll()...
    },300);
    
    oboe("relative-or-absolute path to your data(ajax)")
    .node('CNT',function (count,path) {
        // body...
        d3.select("#count-data-all").text("Expecting " + count + " records");
        return oboe.drop;
    })
    .node('data.*', function (record, path) {
        // body...
        datacnt++;
        return oboe.drop;
    })
    .node('done', function (item, path) {
        // body...
        d3.select("#progress-data").text("all data loaded");
        clearTimeout(timerId);
        d3.select("#count-data-current").text(datacnt);
    });
    

    data sample

    {"CNT":107498, 
     "keys": "DATACENTER","FQDN","VALUE","CONSISTENCY_RESULT","FIRST_REC_DATE","LAST_REC_DATE","ACTIVE","OBJECT_ID","OBJECT_TYPE","CONSISTENCY_MESSAGE","ID_PARAMETER"], 
     "data": [[22,202,"4.9.416.2",0,1449655898,1453867824,-1,"","",0,45],[22,570,"4.9.416.2",0,1449655912,1453867884,-1,"","",0,45],[14,377,"2.102.453.0",-1,1449654863,1468208273,-1,"","",0,45],[14,406,"2.102.453.0",-1,1449654943,1468208477,-1,"","",0,45],[22,202,"10.2.293.0",0,1449655898,1453867824,-1,"","",0,8],[22,381,"10.2.293.0",0,1449655906,1453867875,-1,"","",0,8],[22,570,"10.2.293.0",0,1449655912,1453867884,-1,"","",0,8],[22,381,"1.80",0,1449655906,1453867875,-1,"","",0,41],[22,570,"1.80",0,1449655912,1453867885,-1,"","",0,41],[22,202,"4",0,1449655898,1453867824,-1,"","",0,60],[22,381,"4",0,1449655906,1453867875,-1,"","",0,60],[22,570,"4",0,1449655913,1453867885,-1,"","",0,60],[22,202,"A20",0,1449655898,1453867824,-1,"","",0,52],[22,381,"A20",0,1449655906,1453867875,-1,"","",0,52],[22,570,"A20",0,1449655912,1453867884,-1,"","",0,52],[22,202,"20140201",2,1449655898,1453867824,-1,"","",0,40],[22,381,"20140201",2,1449655906,1453867875,-1,"","",0,40],[22,570,"20140201",2,1449655912,1453867884,-1,"","",0,40],[22,202,"16",-4,1449655898,1453867824,-1,"","",0,58],[22,381,"16",-4,1449655906,1453867875,-1,"","",0,58],[22,570,"16",-4,1449655913,1453867885,-1,"","",0,58],[22,202,"512",0,1449655898,1453867824,-1,"","",0,57],[22,381,"512",0,1449655906,1453867875,-1,"","",0,57],[22,570,"512",0,1449655913,1453867885,-1,"","",0,57],[22,930,"I32",0,1449656143,1461122271,-1,"","",0,66],[22,930,"20140803",-4,1449656143,1461122271,-1,"","",0,64],[14,1359,"10.2.340.19",0,1449655203,1468209257,-1,"","",0,131],[14,567,"10.2.340.19",0,1449655185,1468209111,-1,"","",0,131],[22,930,"4.9.416.0",-1,1449656143,1461122271,-1,"","",0,131],[14,1359,"10.2.293.0",0,1449655203,1468209258,-1,"","",0,13],[14,567,"10.2.293.0",0,1449655185,1468209112,-1,"","",0,13],[22,930,"4.9.288.0",-1,1449656143,1461122271,-1,"","",0,13],[22,930,"4",0,1449656143,1461122271,-1,"","",0,76],[22,930,"96",0,1449656143,1461122271,-1,"","",0,77],[22,930,"4",0,1449656143,1461122271,-1,"","",0,74],[22,930,"VMware ESXi 5.1.0 build-2323236",0,1449656143,1461122271,-1,"","",0,17],[21,616,"A20",0,1449073850,1449073850,-1,"","",0,135],[21,616,"4",0,1449073850,1449073850,-1,"","",0,139],[21,616,"12",0,1449073850,1449073850,-1,"","",0,138],[21,616,"4",0,1449073850,1449073850,-1,"","",0,140],[21,616,"2",0,1449073850,1449073850,-1,"","",0,136],[21,616,"512",0,1449073850,1449073850,-1,"","",0,141],[21,616,"Microsoft Windows Server 2012 R2 Datacenter",0,1449073850,1449073850,-1,"","",0,109],[21,616,"4.4.5.100",0,1449073850,1449073850,-1,"","",0,97],[21,616,"3.2.7895.0",-1,1449073850,1449073850,-1,"","",0,56],[9,2029,"10.7.220.6",-4,1470362743,1478315637,1,"vmnic0","",1,8],[9,1918,"10.7.220.6",-4,1470362728,1478315616,1,"vmnic3","",1,8],[9,1918,"10.7.220.6",-4,1470362727,1478315616,1,"vmnic2","",1,8],[9,1918,"10.7.220.6",-4,1470362727,1478315615,1,"vmnic1","",1,8],[9,1918,"10.7.220.6",-4,1470362727,1478315615,1,"vmnic0","",1,8],[14,205,"934.5.45.0-1vmw",-50,1465996556,1468209226,-1,"","",0,47],[14,1155,"934.5.45.0-1vmw",-50,1465996090,1468208653,-1,"","",0,14],[14,963,"934.5.45.0-1vmw",-50,1465995972,1468208526,-1,"","",0,14],
     "done" : true}
    

    sample of changing keys first to full array of objects

        //function to convert main data to array of objects
        function convertToArrayOfObjects(data) {
            var keys = data.shift(),
                i = 0, k = 0,
                obj = null,
                output = [];
    
            for (i = 0; i < data.length; i++) {
                obj = {};
    
                for (k = 0; k < keys.length; k++) {
                    obj[keys[k]] = data[i][k];
                }
    
                output.push(obj);
            }
    
            return output;
        }
    

    this function above works with a bit modified version of data sample here

       [["ID1","ID2","TEXT1","STATE1","DATE1","DATE2","STATE2","TEXT2","TEXT3","ID3"],
        [14,377,"2.102.453.0",-1,1449654863,1468208273,-1,"","",0,45],
        [14,406,"2.102.453.0",-1,1449654943,1468208477,-1,"","",0,45],
        [22,202,"10.2.293.0",0,1449655898,1453867824,-1,"","",0,8],
        [22,381,"10.2.293.0",0,1449655906,1453867875,-1,"","",0,8],
        [22,570,"10.2.293.0",0,1449655912,1453867884,-1,"","",0,8],
        [22,381,"1.80",0,1449655906,1453867875,-1,"","",0,41],
        [22,570,"1.80",0,1449655912,1453867885,-1,"","",0,41],
        [22,202,"4",0,1449655898,1453867824,-1,"","",0,60],
        [22,381,"4",0,1449655906,1453867875,-1,"","",0,60],
        [22,570,"4",0,1449655913,1453867885,-1,"","",0,60],
        [22,202,"A20",0,1449655898,1453867824,-1,"","",0,52]]
    

    Also consider using memcached https://memcached.org/ or redis https://redis.io/ to cache data at server side, as per data size, redis might get you further

    0 讨论(0)
  • 2021-02-01 10:41

    Consider my class design. It doesn't match yours but it illustrates my points.

    public class MyDataModel
    {
        public List<MyDatum> Data { get; set; }
    }
    
    public class MyDatum
    {
        public long StartDate { get; set; }
        public long EndDate { get; set; }
        public int Duration { get; set; }
        public string Title { get; set; }
    }
    

    The start and end dates are Unix timestamps and the duration is in seconds.

    Serializes to: "{"Data":
    [{"StartDate":1441256019,"EndDate":1441257181, "Duration":451,"Title":"Rad is a cool word."}, ...]}"

    One row of datum is 92 chars.

    Let's start compressing! Convert dates and times to base 60 strings. Store everything in an array of an array of strings.

    public class MyDataModel
    {
        public List<List<string>> Data { get; set; }
    }
    

    Serializes to: "{"Data":[["1pCSrd","1pCTD1","7V","Rad is a cool word."],...]}"

    One row of datum is now 47 chars. moment.js is a good library for working with dates and time. It has functions built in to unpack the base 60 format.

    Working with an array of arrays will make your code less readable so add comments to document the code.

    Load just the most recent 90 days. Zoom to 30 days. When the user drags the brush on the range chart left start fetching more data in chunks of 90 days until the user stops dragging. Add the data to the existing crossfilter using the add method.

    As you add more and more data you will notice that your charts get less and less responsive. That is because you have rendered hundreds or even thousands of elements in your svg. The browser is getting crushed. Use the d3 quantize function to group data points into buckets. Reduce the displayed data to 50 buckets.

    Quantizing is worth the effort and is the only way you can create a scalable graph with a continuously growing dataset.

    Your other option is to abandon the range chart and group the data month over month, day over day, and hour over hour. Then add a date range picker. Since your data would be grouped by month, day, and hour you'll find that even if you rode your bike every hour of the day you'd never have a result set larger than 8766 rows.

    0 讨论(0)
  • 2021-02-01 10:55

    I'd recommend shortening all of your field names in the JSON to 1 character (including "start_date" and "start_time"). That should help a little bit. Also, make sure that compression is turned on on your server. That way the data sent to the browser will be automatically compressed in transit, which should speed things up a ton if it's not already turned on.

    For better responsiveness, I'd also recommend first setting up your Crossfilter (empty), all your dimensions and groups, and all your dc.js charts, then using Crossfilter.add() to add more data into your Crossfilter in chunks. The easiest way to do this is to divide your data up into bite-sized chunks (a few MBs each) and load them serially. So if you are using d3.json, then start the next file load in the callback of the previous file load. This results in a bunch of nested callbacks, which is a bit nasty, but should allow the user interface to be responsive while the data is loading.

    Lastly, with this much data I believe you will start running into performance issues in the browser, not just while loading the data. I suspect you are already seeing this and that the 15 second pause you are seeing is at least partially in the browser. You can check by profiling in your browser's developer tools. To address this, you'll want to profile and identify performance bottlenecks, then try to optimize those. Also - be sure to test on slower computers if they are in your audience.

    0 讨论(0)
提交回复
热议问题