问题
I am querying a research publication database. Articles have different numbers of authors (ranging from 1 to more than 20). My goal is to create an edge list of co-authors for social network analysis using igraph. Below is a snippet of the JSON:
{
"format": "linked-data-api",
"version": "0.2",
"result": {
"_about": "http://network.csiro.au:9500/standalone/publications.json?_pageSize=5&_page=1",
"definition": "http://network.csiro.au:9500/standalone/meta/publications.json",
"extendedMetadataVersion": "http://network.csiro.au:9500/standalone/publications.json?_pageSize=5&_page=1&_metadata=all",
"first": "http://network.csiro.au:9500/standalone/publications.json?_page=0",
"isPartOf": {
"_about": "http://network.csiro.au:9500/standalone/publications.json",
"definition": "http://network.csiro.au:9500/standalone/meta/publications.json",
"hasPart": "http://network.csiro.au:9500/standalone/publications.json?_pageSize=5&_page=1",
"type": [
"http://purl.org/linked-data/api/vocab#ListEndpoint"
]
},
"items": [
{
"_about": "http://network.csiro.au/data/pub_EP1312922",
"access": "Public",
"author": {
"_about": "http://network.csiro.au/data/aimee.slangen",
"hasName": {
"_about": "http://network.csiro.au/data/aimee.slangen_name",
"firstName": "Aimee",
"lastName": "Slangen",
"title": "Ms"
}
},
"authorSeq": {
"_about": "http://network.csiro.au/data/pub_EP1312922_author_0",
"author": {
"_about": "http://network.csiro.au/data/aimee.slangen",
"hasName": {
"_about": "http://network.csiro.au/data/aimee.slangen_name",
"firstName": "Aimee",
"lastName": "Slangen",
"title": "Ms"
}
},
"sequenceNumber": 0
},
"classification": {
"_about": "http://network.csiro.au/data/classification_code_040104",
"name": "Climate Change Processes"
},
"classificationLevel": "http://network.csiro.au/data/unclassified",
"journalTitle": "Journal of Geophysical Research-Oceans",
"keyword": " ",
"outcome": "Approved",
"pages": "156-164",
"project": "http://network.csiro.au/data/project_PD00003609",
"publicationVolume": "119",
"publishedDate": "9-Jan-2014",
"publisher": "American Geophysical Union",
"title": "Regional Differences of Relative Sea Level Changes in the Northwest Atlantic: Historical Trends and Future Projections",
"wbscode": "R-03426-01-003",
"yearOfPublication": "2014"
},
{
"_about": "http://network.csiro.au/data/pub_EP112347",
"access": "Public",
"author": {
"_about": "http://network.csiro.au/data/roland.pitcher",
"hasName": {
"_about": "http://network.csiro.au/data/roland.pitcher_name",
"firstName": "Roland",
"lastName": "Pitcher",
"title": "Dr"
}
},
"authorSeq": {
"_about": "http://network.csiro.au/data/pub_EP112347_author_0",
"author": {
"_about": "http://network.csiro.au/data/roland.pitcher",
"hasName": {
"_about": "http://network.csiro.au/data/roland.pitcher_name",
"firstName": "Roland",
"lastName": "Pitcher",
"title": "Dr"
}
},
"sequenceNumber": 0
},
"classification": {
"_about": "http://network.csiro.au/data/classification_code_050209",
"name": "Natural Resource Management"
},
"classificationLevel": "http://network.csiro.au/data/unclassified",
"keyword": " ",
"outcome": "Approved",
"project": "http://network.csiro.au/data/project_PD00000752",
"publisher": "Queensland Department of Environment and Resource Management",
"title": "Understanding and Managing the Effects of Trawling on the Seabed in the Great Barrier Reef",
"wbscode": "R-00654-03-003",
"yearOfPublication": " "
},
{
"_about": "http://network.csiro.au/data/pub_EP148991",
"access": "CSIRO Only",
"author": {
"_about": "http://network.csiro.au/data/rob.bramley",
"hasName": {
"_about": "http://network.csiro.au/data/rob.bramley_name",
"firstName": "Rob",
"lastName": "Bramley",
"title": "Dr"
}
},
"authorSeq": {
"_about": "http://network.csiro.au/data/pub_EP148991_author_0",
"author": {
"_about": "http://network.csiro.au/data/rob.bramley",
"hasName": {
"_about": "http://network.csiro.au/data/rob.bramley_name",
"firstName": "Rob",
"lastName": "Bramley",
"title": "Dr"
}
},
"sequenceNumber": 0
},
"classification": {
"_about": "http://network.csiro.au/data/classification_code_070107",
"name": "Farming Systems Research"
},
"classificationLevel": "http://network.csiro.au/data/unclassified",
"keyword": " ",
"outcome": "Approved",
"pages": "26 + appendices",
"project": "http://network.csiro.au/data/project_PD00002886",
"publishedDate": "17-Sep-2014",
"publisher": "SRA",
"title": "A collaborative approach to Precision Agriculture RDE for the Australian Sugar Industry",
"wbscode": "R-02709-01",
"yearOfPublication": "2014"
},
{
"_about": "http://network.csiro.au/data/pub_EP151976",
"access": "Public",
"author": {
"_about": "http://network.csiro.au/data/paul.krummel",
"hasName": {
"_about": "http://network.csiro.au/data/paul.krummel_name",
"firstName": "Paul",
"lastName": "Krummel",
"title": "Mr"
}
},
"authorSeq": {
"_about": "http://network.csiro.au/data/pub_EP151976_author_0",
"author": {
"_about": "http://network.csiro.au/data/paul.krummel",
"hasName": {
"_about": "http://network.csiro.au/data/paul.krummel_name",
"firstName": "Paul",
"lastName": "Krummel",
"title": "Mr"
}
},
"sequenceNumber": 0
},
"classification": [
{
"_about": "http://network.csiro.au/data/classification_code_040104",
"name": "Climate Change Processes"
},
{
"_about": "http://network.csiro.au/data/classification_code_040199",
"name": "Atmospheric Sciences not elsewhere classified"
}
],
"classificationLevel": "http://network.csiro.au/data/unclassified",
"journalTitle": "Atmospheric Chemistry and Physics",
"keyword": [
"CH4",
"OH",
"hydroxyl radical",
"methane"
],
"outcome": "Approved",
"pages": "7943\u20137956",
"project": "http://network.csiro.au/data/project_PD00009165",
"publicationVolume": "16",
"publishedDate": "30-Jun-2016",
"publisher": "Copernicus GmbH",
"title": "Role of OH variability in the stalling of the global atmospheric CH4 growth rate from 1999 to 2006",
"wbscode": "R-07848; R-06420; R-07768",
"yearOfPublication": "2016"
},
{
"_about": "http://network.csiro.au/data/pub_EP152677",
"access": "CSIRO Only",
"author": [
{
"_about": "http://network.csiro.au/data/andrew.george",
"hasName": {
"_about": "http://network.csiro.au/data/andrew.george_name",
"firstName": "Andrew",
"lastName": "George",
"title": "Dr"
}
},
{
"_about": "http://network.csiro.au/data/sigrid.lehnert",
"hasName": {
"_about": "http://network.csiro.au/data/sigrid.lehnert_name",
"firstName": "Sigrid",
"lastName": "Lehnert",
"title": "Dr"
}
},
{
"_about": "http://network.csiro.au/data/toni.reverter-gomez",
"hasName": {
"_about": "http://network.csiro.au/data/toni.reverter-gomez_name",
"firstName": "Toni",
"lastName": "Reverter-Gomez",
"title": "Dr"
}
},
{
"_about": "http://network.csiro.au/data/yutao.li",
"hasName": {
"_about": "http://network.csiro.au/data/yutao.li_name",
"firstName": "Yutao",
"lastName": "Li",
"title": "Dr"
}
}
],
"authorSeq": [
{
"_about": "http://network.csiro.au/data/pub_EP152677_author_0",
"author": {
"_about": "http://network.csiro.au/data/yutao.li",
"hasName": {
"_about": "http://network.csiro.au/data/yutao.li_name",
"firstName": "Yutao",
"lastName": "Li",
"title": "Dr"
}
},
"sequenceNumber": 0
},
{
"_about": "http://network.csiro.au/data/pub_EP152677_author_1",
"author": {
"_about": "http://network.csiro.au/data/andrew.george",
"hasName": {
"_about": "http://network.csiro.au/data/andrew.george_name",
"firstName": "Andrew",
"lastName": "George",
"title": "Dr"
}
},
"sequenceNumber": 1
},
{
"_about": "http://network.csiro.au/data/pub_EP152677_author_2",
"author": {
"_about": "http://network.csiro.au/data/sigrid.lehnert",
"hasName": {
"_about": "http://network.csiro.au/data/sigrid.lehnert_name",
"firstName": "Sigrid",
"lastName": "Lehnert",
"title": "Dr"
}
},
"sequenceNumber": 2
},
{
"_about": "http://network.csiro.au/data/pub_EP152677_author_3",
"author": {
"_about": "http://network.csiro.au/data/toni.reverter-gomez",
"hasName": {
"_about": "http://network.csiro.au/data/toni.reverter-gomez_name",
"firstName": "Toni",
"lastName": "Reverter-Gomez",
"title": "Dr"
}
},
"sequenceNumber": 3
}
],
"classification": {
"_about": "http://network.csiro.au/data/classification_code_070201",
"name": "Animal Breeding"
},
"classificationLevel": "http://network.csiro.au/data/unclassified",
"conferenceDate": "28th-30th September 2015",
"conferenceLocation": "Lorne, Victoria",
"conferenceName": "21st AAABG",
"keyword": " ",
"outcome": "Approved",
"pages": "433-436",
"project": "http://network.csiro.au/data/project_PD00005603",
"publicationVolume": "21",
"publishedDate": "25-Sep-2015",
"publisher": "Association for the Advancement of Animal Breeding and Genetics",
"title": "Using Random Forests to Identify SNP Associated With Leg Defect in Broiler Chicken: Impact of Correcting For Population Structures",
"wbscode": "R-05156",
"yearOfPublication": "2015"
}
],
"itemsPerPage": 5,
"next": "http://network.csiro.au:9500/standalone/publications.json?_page=2",
"page": 1,
"prev": "http://network.csiro.au:9500/standalone/publications.json?_page=0",
"startIndex": 6,
"totalResults": 47023,
"type": [
"http://purl.org/linked-data/api/vocab#Page"
]
}
}
I read the data in as follows:
# Attach the two JSON packages used in this question.
library(jsonlite)
library(tidyjson)
# fromJSON() downloads the page and parses it into an R object; the error
# quoted further down reports that object's class as "list".
pubs <- fromJSON("http://network.csiro.au:9500/standalone/publications.json?_page=1&_pageSize=5")
When trying to extract meaningful data using tidyjson, I get this error:
# Try to hand `pubs` to tidyjson and step into its "items" element.
# NOTE(review): `pubs` is an already-parsed list here, and as.tbl_json() has
# no method for that class, which is what the error reports.
pubs %>%
as.tbl_json %>%
enter_object("items")
Error in UseMethod("as.tbl_json") :
no applicable method for 'as.tbl_json' applied to an object of class "list"
I am not an expert in R or JSON so would appreciate some guidance. Using the above example, I want to create for each publication an edge list of co-authors like this:
_about yearOfPublication from to
http://network.url.com/data/pub_EP16079 2011 Colin Jackson Holly Trueman
http://network.url.com/data/pub_EP16079 2011 Colin Jackson Tara Sutherland
http://network.url.com/data/pub_EP16079 2011 Colin Jackson Trevor Rapson
http://network.url.com/data/pub_EP16079 2011 Holly Trueman Tara Sutherland
http://network.url.com/data/pub_EP16079 2011 Holly Trueman Trevor Rapson
http://network.url.com/data/pub_EP16079 2011 Tara Sutherland Trevor Rapson
I hope someone can help me! Thanks in advance.
回答1:
This is a bit of a tricky example. See this issue for a discussion of how tidyjson could better handle objects that are sometimes arrays.
While not the cleanest solution, I think this does get the job done - you could probably functionalize some of these groups of steps to optimize code-reuse.
The basic aim is to parse enough of the object to get to the authors, then use a separate workflow for objects and arrays. The arrays require tidyr::expand to complete the combinations of all authors (since those combinations are not represented in the data).
## Attach the packages used throughout this answer.
library(dplyr)
library(tidyjson)
library(tidyr)

## Read the saved JSON document and collapse its lines into one string.
json <- paste(
  readLines("ex.json"),
  collapse = " "
)

## Walk down to each publication, keep its identifier and year, then step
## into "author". Some publications hold a single author object and others
## an array of authors, so record the JSON type of each entry for later use.
prep <- json %>%
  enter_object("result") %>%
  enter_object("items") %>%
  gather_array() %>%
  spread_values(
    about = jstring("_about"),
    yearOfPublication = jstring("yearOfPublication")
  ) %>%
  enter_object("author") %>%
  json_types()
## Publications whose "author" entry is a single JSON object: pull the
## author's name and use it as both `from` and `to`.
authorobj <- prep %>%
  filter(as.character(type) == "object") %>%
  spread_values(
    authorFirst = jstring("hasName", "firstName"),
    authorLast = jstring("hasName", "lastName")
  ) %>%
  mutate(
    from = paste(authorFirst, authorLast),
    to = from
  ) %>%
  select(-authorFirst, -authorLast) %>%
  tbl_df()
## Publications whose "author" entry is a JSON array: emit one row per
## author (indexed by `authorid`) and build the `from` name.
authorarr <- prep %>%
  filter(as.character(type) == "array") %>%
  gather_array("authorid") %>%
  spread_values(
    authorFirst = jstring("hasName", "firstName"),
    authorLast = jstring("hasName", "lastName")
  ) %>%
  mutate(
    from = paste(authorFirst, authorLast)
  ) %>%
  select(-authorFirst, -authorLast)
## Use tidyr::expand to complete the from/to combinations of authors
## *within* each publication (grouped by `array.index`).
##
## The combinations are expanded from the grouped `from` column. Referring
## to `authorarr$from` instead would ignore the grouping and use the whole
## column, pairing authors from different publications as soon as more than
## one publication carries an author array.
authorarr <- authorarr %>%
  tbl_df()

authorarr <- authorarr %>%
  left_join(
    authorarr %>%
      group_by(array.index) %>%
      # `from` and `to = from` are evaluated per group, so only authors
      # of the same publication are combined.
      expand(from, to = from) %>%
      ungroup(),
    by = c("array.index", "from")
  )
## Stack the single-author and multi-author rows into one edge list and
## show only the publication index and the two author columns.
select(
  dplyr::bind_rows(authorobj, authorarr),
  array.index, from, to
)
#> # A tibble: 20 x 3
#> array.index from to
#> <int> <chr> <chr>
#> 1 1 Aimee Slangen Aimee Slangen
#> 2 2 Roland Pitcher Roland Pitcher
#> 3 3 Rob Bramley Rob Bramley
#> 4 4 Paul Krummel Paul Krummel
#> 5 5 Andrew George Andrew George
#> 6 5 Andrew George Sigrid Lehnert
#> 7 5 Andrew George Toni Reverter-Gomez
#> 8 5 Andrew George Yutao Li
#> 9 5 Sigrid Lehnert Andrew George
#> 10 5 Sigrid Lehnert Sigrid Lehnert
#> 11 5 Sigrid Lehnert Toni Reverter-Gomez
#> 12 5 Sigrid Lehnert Yutao Li
#> 13 5 Toni Reverter-Gomez Andrew George
#> 14 5 Toni Reverter-Gomez Sigrid Lehnert
#> 15 5 Toni Reverter-Gomez Toni Reverter-Gomez
#> 16 5 Toni Reverter-Gomez Yutao Li
#> 17 5 Yutao Li Andrew George
#> 18 5 Yutao Li Sigrid Lehnert
#> 19 5 Yutao Li Toni Reverter-Gomez
#> 20 5 Yutao Li Yutao Li
来源:https://stackoverflow.com/questions/40878226/extracting-data-from-nested-json-document-using-tidyjson-in-r