Labelbox has two ways of exporting data:
- Label by label: good for real-time use cases or for exporting a subset of the data
- Bulk export: good for workflows that require access to all of the labels in a project
First, get the ID of the project you want to export from.
# List the id and name of each project under `user`.
# Use one of the returned ids as the project ID in the label queries.
query MyProjects {
user {
projects {
id
name
}
}
}
Using the project ID from the above query, you can pull all the labels created in that project.
# Fetch the first 5 labels of one project, together with their metadata.
query APIGetPageOfLabels {
project(where:{id: "<INSERT_PROJECT_ID_HERE>"}) {
# `first` caps how many labels are returned by this request.
labels(first: 5){
id
label
# User recorded as the creator of the label.
createdBy{
id
email
}
# Label type (id and name).
type {
id
name
}
secondsToLabel
agreement
# Data row referenced by the label.
dataRow {
id
rowData
}
}
}
}
To fetch every page of labels, you can use the code below.
import json
from graphqlclient import GraphQLClient
# Client bound to the Labelbox GraphQL endpoint; shared by the functions below.
client = GraphQLClient('https://api.labelbox.com/graphql')
# Replace <API_KEY_HERE> with your API key; it is handed to the client as a bearer token.
client.inject_token('Bearer <API_KEY_HERE>')
def get_page_of_labels(project_id, skip, page_size):
    """Fetch a single page of labels for a project.

    Args:
        project_id: ID of the project whose labels are requested.
        skip: Number of labels to skip before the page starts.
        page_size: Maximum number of labels to request for the page.

    Returns:
        The list found under ``data.project.labels`` in the response, or an
        empty list when that value is empty or null.
    """
    # The query text is kept flush with the margin exactly as published;
    # GraphQL ignores leading whitespace.
    page_query = '''
query APIGetPageOfLabels($projectId: ID!, $skip: Int!, $first: PageSize!) {
project(where:{id: $projectId}) {
labels(skip: $skip, first: $first){
id
label
createdBy{
id
email
}
type {
id
name
}
secondsToLabel
agreement
dataRow {
id
rowData
}
}
}
}
'''
    variables = {
        'projectId': project_id,
        'skip': skip,
        'first': page_size,
    }
    raw_response = client.execute(page_query, variables)
    payload = json.loads(raw_response)['data']
    labels = payload['project']['labels']
    if not labels:
        # Normalise a null/empty result to a list so callers can use len().
        return []
    return labels
def get_all_labels(project_id, skip=0, page_size=100, all_labels=None):
    """Fetch every label of a project by walking through its pages.

    Pages are requested one after another with ``get_page_of_labels`` until a
    page comes back with fewer than ``page_size`` labels, which marks the end.

    Args:
        project_id: ID of the project whose labels are requested.
        skip: Number of labels to skip before the first page.
        page_size: Number of labels requested per page; must be at least 1.
        all_labels: Optional labels already collected; they are placed in
            front of the fetched ones. The passed object is not modified.

    Returns:
        A new list holding ``all_labels`` (if given) followed by every
        fetched label.

    Raises:
        ValueError: If ``page_size`` is smaller than 1, since the end-of-data
            check could then never terminate the paging.
    """
    if page_size < 1:
        raise ValueError('page_size must be a positive integer')

    # Copy so the caller's list (and the default) is never shared or mutated.
    collected = list(all_labels) if all_labels else []

    # Iterate instead of recursing: one stack frame per page would hit the
    # interpreter's recursion limit on projects with many pages.
    while True:
        page = get_page_of_labels(project_id, skip, page_size)
        collected.extend(page)
        if len(page) != page_size:
            # A short (or empty) page means there is nothing left to fetch.
            return collected
        skip += page_size
# Replace the placeholder with the ID of the project to export.
project_id = '<PROJECT_ID_HERE>'
# Page through the project until every label has been fetched.
all_labels = get_all_labels(project_id)
# Report how many labels were retrieved.
print(len(all_labels))
Here are some common filters you might want to apply to your export:
# Label query narrowed with a `where` filter on the labels of one project.
query APIGetPageOfLabels{
project(where:{id: "<INSERT_PROJECT_ID_HERE>"}) {
labels(
first: 5,
where: {
# Restrict to labels created by one user.
createdBy:{id:"USER_ID"},
# Creation-time window (presumably: created before `_lt` and after `_gt`).
createdAt_lt:"2018-04-24T22:09:21.753Z",
createdAt_gt:"2018-04-01T22:09:21.753Z",
type:{id:"TYPE_ID"}, # Skipped vs Submitted
# Agreement range (presumably: greater than `_gt` and less than `_lt`).
agreement_gt:0,
agreement_lt:0.5,
# Restrict to labels whose data row is in one dataset.
dataRow:{
dataset:{
id:"SOME_DATASET_ID"
}
}
}
){
id
label
}
}
}
This bulk export call returns a URL to a JSON file containing the labels for the project. The format of the JSON file is the same as the export format found in the UI.
If the mutation returns "shouldPoll: true", the export is still being generated and your script should repeat the same request at an interval until it completes. See the Python end-to-end example.
30-Minute Frequency
The exportLabels mutation generates a new downloadUrl at most once every 30 minutes. If you call exportLabels twice within a short time period, the second call returns the same downloadUrl and createdAt timestamp.
# Request a bulk export of the labels of one project.
mutation{
exportLabels(data:{
projectId:"<INSERT_PROJECT_ID_HERE>"
}){
# URL of the JSON file containing the project's labels.
downloadUrl
# Timestamp returned with the export; repeated calls within a short period return the same value.
createdAt
# true while the export is still being generated; repeat the request until it completes.
shouldPoll
}
}
Python End-to-End Example
# Two things are needed to run this script:
# 1. run "pip install graphqlclient"
# 2. Fill in <API-KEY-HERE> (https://app.labelbox.com/settings/apikey)
import json
import time
import urllib.request, json
from graphqlclient import GraphQLClient
# Client bound to the Labelbox GraphQL endpoint; shared by the functions below.
client = GraphQLClient('https://api.labelbox.com/graphql')
# Replace <API-KEY-HERE> with your API key; it is handed to the client as a bearer token.
client.inject_token('Bearer <API-KEY-HERE>')
def get_export_url(project_id):
    """Ask the API for the bulk-export job of a project.

    Args:
        project_id: ID of the project whose labels should be exported.

    Returns:
        The ``exportLabels`` object of the response, a dict with the keys
        ``downloadUrl``, ``createdAt`` and ``shouldPoll``.
    """
    # The mutation text is kept flush with the margin exactly as published;
    # GraphQL ignores leading whitespace.
    export_mutation = """
mutation GetExportUrl($project_id: ID!){
exportLabels(data:{
projectId: $project_id
}){
downloadUrl
createdAt
shouldPoll
}
}
"""
    variables = {'project_id': project_id}
    raw_response = client.execute(export_mutation, variables)
    payload = json.loads(raw_response)
    return payload['data']['exportLabels']
def get_project_labels(project_id, poll_interval=3):
    """Download all labels of a project through the bulk export.

    The export job is requested repeatedly while it reports ``shouldPoll``;
    once it no longer does, the JSON file behind ``downloadUrl`` is fetched
    and parsed.

    Args:
        project_id: ID of the project whose labels should be exported.
        poll_interval: Seconds to wait between two export requests while the
            export is still being generated.

    Returns:
        The parsed content of the downloaded JSON export.
    """
    export_job = get_export_url(project_id)
    # Poll in a loop rather than by recursion so that a slow export cannot
    # exhaust the interpreter's recursion limit.
    while export_job['shouldPoll']:
        print('Export Generating...')
        time.sleep(poll_interval)
        export_job = get_export_url(project_id)

    with urllib.request.urlopen(export_job['downloadUrl']) as response:
        return json.loads(response.read().decode())
def get_projects():
    """Fetch the id and name of the projects returned by the API.

    Returns:
        The value found under ``data.projects`` in the response; each entry
        carries the ``id`` and ``name`` fields requested by the query.
    """
    # The query text is kept flush with the margin exactly as published;
    # GraphQL ignores leading whitespace.
    projects_query = """
query GetAProjectFromOrganization {
projects {
id
name
}
}
"""
    payload = json.loads(client.execute(projects_query))
    return payload['data']['projects']
if __name__ == "__main__":
    # Export and print the labels of the first project returned by the API.
    projects = get_projects()
    if not projects:
        # Indexing an empty result would fail with an unhelpful IndexError.
        raise SystemExit('No projects found; nothing to export.')
    project_id = projects[0]['id']
    print(get_project_labels(project_id))