Aggregate and export data as JSON or CSV in Mongo Shell (mongosh)
Fed up with copy-pasting output from graphical MongoDB clients? Want to pre-process raw data before exporting? This one’s for you =)
The idea here is to use the `--file` mongosh flag to run a script written in JavaScript (JS). Since it is JS, it can use established npm packages. To keep things simple, this example only uses built-in Node.js modules, avoiding a node_modules folder altogether.
Let’s say we have created a Mongo Atlas database with sample data as shown here. It has a `sample_restaurants` database containing a `restaurants` Collection. Each Document in this Collection looks something like this:
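// One of the Italian restaurants from the sample data set;
// the grades array and restaurant_id are trimmed here for brevity
{
  _id: ObjectId("5eb3d668b31de5d588f42962"),
  address: {
    building: '10004',
    coord: [ -74.03400479999999, 40.6127077 ],
    street: '4 Avenue',
    zipcode: '11209'
  },
  borough: 'Brooklyn',
  cuisine: 'Italian',
  grades: [ /* array of { date, grade, score } objects */ ],
  name: 'Philadelhia Grille Express',
  restaurant_id: '...'
}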
Now picture an analyst looking for the next top Italian restaurant. The questions that pop up would be:
- Which boroughs contain an Italian restaurant?
- I’d like to limit the number of results so that it won’t be hard to read
- I want it in a format that I can easily handle (JSON, CSV)
The script below does just that:
const fs = require('fs');

/**
 * Script to query and export data from db as either JSON or CSV
 * Run command from shell. Path is relative to terminal:
 * mongosh --file "./mongo-db-export.script.js"
 * Transforms are entirely up to you =)
 * Tested with mongosh v1.6.1
 */

//###################################################################//
// Script configs. Set this way since mongosh cannot pass in command args
const scriptName = 'exportMongoDbToFile';
const availableExportFormats = {
  json: 'json',
  csv: 'csv',
};
const exportFormat = availableExportFormats.csv;
const exportFilePath = './temp/output'; // Don't include file format!
const connectionString = `mongodb+srv://username:password@mongo-hostname.mongodb.net`;
const dbName = `sample_restaurants`;
const collectionName = 'restaurants';
const aggregationQuery = [
  {$match: {cuisine: 'Italian'}},
  {$project: {name: 1, cuisine: 1, borough: 1, address: 1}},
  {$limit: 5},
];
//###################################################################//

// mongosh global
db = connect(`${connectionString}/${dbName}`);

const exportToJson = (results) => {
  const stream = fs.createWriteStream(`${exportFilePath}.json`);
  stream.once('open', function () {
    stream.write('[' + '\n');
    for (let i = 0; i < results.length; ++i) {
      if (i === results.length - 1) {
        stream.write(JSON.stringify(results[i]) + '\n');
        continue;
      }
      stream.write(JSON.stringify(results[i]) + ',\n');
    }
    stream.write(']' + '\n');
    stream.end();
  });
};

const exportToCsv = async (results) => {
  const headers = [];
  for (const result of results) {
    Object.keys(result).forEach((key) => {
      if (!headers.includes(key)) {
        headers.push(key);
      }
    });
  }
  console.log(`exportToCsv: Headers accumulated: ${headers.join(',')}`);
  const stream = fs.createWriteStream(`${exportFilePath}.csv`);
  stream.once('open', function () {
    stream.write(headers.join(',') + '\n');
    for (const result of results) {
      // fill('') so every column starts as an empty string;
      // new Array(n).map() would skip the holes and leave them unset
      const row = new Array(headers.length).fill('');
      // Match obj.key to same index in header
      Object.keys(result).forEach((key) => {
        const keyPosInHeader = headers.indexOf(key);
        const processedResult =
          typeof result[key] === 'object'
            ? /**
               * Handling for fields containing objects including nested objects?
               * Just replace commas with semicolons for now */
              JSON.stringify(result[key]).replaceAll(',', ';')
            : JSON.stringify(result[key]);
        // Match item to existing header columns
        if (keyPosInHeader !== -1) {
          // JSON.stringify here since fields can be ObjectId or Object
          // Both are not compatible for conversion to string directly
          row[keyPosInHeader] = processedResult;
        } else {
          row.push(processedResult);
        }
      });
      // No trailing comma here, or every row would gain an empty last column
      stream.write(row.join(',') + '\n');
    }
    stream.end();
  });
};

const getDocsUsingAggregation = async () => {
  // Returns output as a JS array
  const results = await db.getCollection(collectionName).aggregate(aggregationQuery).toArray();
  console.log(
    `=======> getDocsUsingAggregation: results found: ${results?.length || 0}. Ex: `,
    results[0],
  );
  return results;
};

const main = async () => {
  if (!Object.values(availableExportFormats).includes(exportFormat)) {
    throw new Error(`Invalid exportFormat selected: ${exportFormat}`);
  }
  try {
    const scriptStartTime = performance.now();
    const results = await getDocsUsingAggregation();
    if (exportFormat === availableExportFormats.json) {
      exportToJson(results);
    }
    if (exportFormat === availableExportFormats.csv) {
      await exportToCsv(results);
    }
    const scriptEndTime = performance.now();
    const used = process.memoryUsage().heapUsed / 1024 / 1024;
    const executionTime = `${scriptEndTime - scriptStartTime} ms`;
    console.log(`Script: ${scriptName} successful in ${executionTime}`);
    console.log(`Script: ${scriptName} used approximately ${Math.round(used * 100) / 100} MB`);
  } catch (error) {
    console.error(`${scriptName} error`, {message: error?.message});
  }
};

main()
  .then(() => console.log(`Finished running script ${scriptName}`))
  .catch((err) => {
    console.error(err?.message);
  })
  .finally(() => process.exit());
Let’s go over the parts:
Script config
//###################################################################//
// Script configs. Set this way since mongosh cannot pass in command args
const scriptName = 'exportMongoDbToFile';
const availableExportFormats = {
  json: 'json',
  csv: 'csv',
};
const exportFormat = availableExportFormats.csv;
const exportFilePath = './temp/output'; // Don't include file format!
const connectionString = `mongodb+srv://username:password@mongo-hostname.mongodb.net`;
const dbName = `sample_restaurants`;
const collectionName = 'restaurants';
const aggregationQuery = [
  {$match: {cuisine: 'Italian'}},
  {$project: {name: 1, cuisine: 1, borough: 1, address: 1}},
  {$limit: 5},
];
//###################################################################//
This group of variables provides the configuration to run the entire script. They are hard-coded because mongosh cannot accept command line arguments (as of Mar 2023).
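If editing constants between runs gets tedious, one possible workaround (a sketch; mongosh has no official mechanism for passing script arguments) is to read overrides from environment variables. mongosh runs on Node.js, and the script already leans on Node globals such as `process.memoryUsage()`, so `process.env` should be available too:

// Hypothetical usage: EXPORT_FORMAT=json mongosh --file "./mongo-db-export.script.js"
// Falls back to CSV when the variable is not set
const exportFormat = process.env.EXPORT_FORMAT || availableExportFormats.csv;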
main()
const main = async () => {
  if (!Object.values(availableExportFormats).includes(exportFormat)) {
    throw new Error(`Invalid exportFormat selected: ${exportFormat}`);
  }
  try {
    const scriptStartTime = performance.now();
    const results = await getDocsUsingAggregation();
    if (exportFormat === availableExportFormats.json) {
      exportToJson(results);
    }
    if (exportFormat === availableExportFormats.csv) {
      await exportToCsv(results);
    }
    const scriptEndTime = performance.now();
    const used = process.memoryUsage().heapUsed / 1024 / 1024;
    const executionTime = `${scriptEndTime - scriptStartTime} ms`;
    console.log(`Script: ${scriptName} successful in ${executionTime}`);
    console.log(`Script: ${scriptName} used approximately ${Math.round(used * 100) / 100} MB`);
  } catch (error) {
    console.error(`${scriptName} error`, {message: error?.message});
  }
};
`main()` is where all the big steps happen. It also logs metrics and any errors that pop up while the script runs.
The steps are:
- Check that the specified export format is valid
- Use aggregation in `getDocsUsingAggregation()` to get all Docs matching the provided `aggregationQuery`. I favour `aggregate()` over `find()` because it has much more functionality (see the sketch after this list).
- Convert the gathered Docs into the specified export format
- End the script and log the execution time and memory use
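For instance, the first analyst question (“Which boroughs contain an Italian restaurant?”) only needs a different `aggregationQuery`. A sketch using `$group` (field names follow the sample data set):

const aggregationQuery = [
  {$match: {cuisine: 'Italian'}},
  // Collect each distinct borough with at least one Italian restaurant,
  // counting matches per borough along the way
  {$group: {_id: '$borough', count: {$sum: 1}}},
  {$sort: {count: -1}},
];

Something like this is awkward to express with `find()` alone, which is exactly why I reach for `aggregate()`.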
I have put together 2 options to export the gathered Docs:
exportToJson()
const exportToJson = (results) => {
  const stream = fs.createWriteStream(`${exportFilePath}.json`);
  stream.once('open', function () {
    stream.write('[' + '\n');
    for (let i = 0; i < results.length; ++i) {
      if (i === results.length - 1) {
        stream.write(JSON.stringify(results[i]) + '\n');
        continue;
      }
      stream.write(JSON.stringify(results[i]) + ',\n');
    }
    stream.write(']' + '\n');
    stream.end();
  });
};
Using `stream.write` pushes out the results as JSON line by line. The first and last writes are `[` and `]`, to form the JSON array. The check `(i === results.length - 1)` ensures the last element’s line does not end with a comma, which would make the output invalid JSON.
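Since `toArray()` already pulled the whole result set into memory, streaming is optional here. If you prefer simplicity, a minimal alternative (a sketch, not part of the script above) is a single synchronous write:

// Pretty-printed JSON in one write; fine for result sets that fit in memory
const exportToJsonSimple = (results) => {
  fs.writeFileSync(`${exportFilePath}.json`, JSON.stringify(results, null, 2));
};

The streaming version still pays off if you later swap `toArray()` for iterating over a large cursor.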
exportToCsv()
const exportToCsv = async (results) => {
  const headers = [];
  for (const result of results) {
    Object.keys(result).forEach((key) => {
      if (!headers.includes(key)) {
        headers.push(key);
      }
    });
  }
  console.log(`exportToCsv: Headers accumulated: ${headers.join(',')}`);
  const stream = fs.createWriteStream(`${exportFilePath}.csv`);
  stream.once('open', function () {
    stream.write(headers.join(',') + '\n');
    for (const result of results) {
      // fill('') so every column starts as an empty string;
      // new Array(n).map() would skip the holes and leave them unset
      const row = new Array(headers.length).fill('');
      // Match obj.key to same index in header
      Object.keys(result).forEach((key) => {
        const keyPosInHeader = headers.indexOf(key);
        const processedResult =
          typeof result[key] === 'object'
            ? /**
               * Handling for fields containing objects including nested objects?
               * Just replace commas with semicolons for now */
              JSON.stringify(result[key]).replaceAll(',', ';')
            : JSON.stringify(result[key]);
        // Match item to existing header columns
        if (keyPosInHeader !== -1) {
          // JSON.stringify here since fields can be ObjectId or Object
          // Both are not compatible for conversion to string directly
          row[keyPosInHeader] = processedResult;
        } else {
          row.push(processedResult);
        }
      });
      // No trailing comma here, or every row would gain an empty last column
      stream.write(row.join(',') + '\n');
    }
    stream.end();
  });
};
With CSV export, the concerns to handle are:
- Matching Document keys to CSV headers: this ensures each value lands in the column whose name sits at the same index in `headers[]`.
- Handling Objects and Arrays: these are comma-separated when stringified. To keep them from splitting across columns, their commas are replaced with semicolons (`;`); a more standards-friendly alternative is sketched below.
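The semicolon trick sidesteps quoting rather than solving it. If downstream tools expect standard (RFC 4180-style) CSV, a field escaper along these lines could replace the `replaceAll` workaround (a sketch, not part of the script above):

// Quote a field when it contains a comma, double quote or newline,
// doubling any embedded quotes, per RFC 4180
const escapeCsvField = (value) => {
  // Objects keep their JSON form; primitives become plain strings
  const str = typeof value === 'object' && value !== null ? JSON.stringify(value) : String(value);
  return /[",\n]/.test(str) ? `"${str.replaceAll('"', '""')}"` : str;
};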
Result
From a terminal, run the script with:
mongosh --file "./mongo-db-export.script.js"
This produces the following at the specified `exportFilePath`:
As CSV:
_id,address,borough,cuisine,name
"5eb3d668b31de5d588f42962",{"building":"10004";"coord":[-74.03400479999999;40.6127077];"street":"4 Avenue";"zipcode":"11209"},"Brooklyn","Italian","Philadelhia Grille Express",
"5eb3d668b31de5d588f42965",{"building":"1028";"coord":[-73.966032;40.762832];"street":"3 Avenue";"zipcode":"10065"},"Manhattan","Italian","Isle Of Capri Resturant",
"5eb3d668b31de5d588f42974",{"building":"251";"coord":[-73.9775552;40.7432016];"street":"East 31 Street";"zipcode":"10016"},"Manhattan","Italian","Marchis Restaurant",
"5eb3d668b31de5d588f4297f",{"building":"67";"coord":[-74.0707363;40.59321569999999];"street":"Olympia Boulevard";"zipcode":"10305"},"Staten Island","Italian","Crystal Room",
"5eb3d668b31de5d588f42988",{"building":"93";"coord":[-73.99950489999999;40.7169224];"street":"Baxter Street";"zipcode":"10013"},"Manhattan","Italian","Forlinis Restaurant",
As JSON:
[
{
"_id": "5eb3d668b31de5d588f42962",
"address": {
"building": "10004",
"coord": [
-74.03400479999999,
40.6127077
],
"street": "4 Avenue",
"zipcode": "11209"
},
"borough": "Brooklyn",
"cuisine": "Italian",
"name": "Philadelhia Grille Express"
},
{
"_id": "5eb3d668b31de5d588f42965",
"address": {
"building": "1028",
"coord": [
-73.966032,
40.762832
],
"street": "3 Avenue",
"zipcode": "10065"
},
"borough": "Manhattan",
"cuisine": "Italian",
"name": "Isle Of Capri Resturant"
},
{
"_id": "5eb3d668b31de5d588f42974",
"address": {
"building": "251",
"coord": [
-73.9775552,
40.7432016
],
"street": "East 31 Street",
"zipcode": "10016"
},
"borough": "Manhattan",
"cuisine": "Italian",
"name": "Marchis Restaurant"
},
{
"_id": "5eb3d668b31de5d588f4297f",
"address": {
"building": "67",
"coord": [
-74.0707363,
40.59321569999999
],
"street": "Olympia Boulevard",
"zipcode": "10305"
},
"borough": "Staten Island",
"cuisine": "Italian",
"name": "Crystal Room"
},
{
"_id": "5eb3d668b31de5d588f42988",
"address": {
"building": "93",
"coord": [
-73.99950489999999,
40.7169224
],
"street": "Baxter Street",
"zipcode": "10013"
},
"borough": "Manhattan",
"cuisine": "Italian",
"name": "Forlinis Restaurant"
}
]
Give it a try! With a script like this, you can also add capabilities not currently in mongosh, such as transformations and sub-filters applied after the query.
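A transform, for example, is just a function applied between `getDocsUsingAggregation()` and the export step. A sketch (the flattened `fullAddress` field is made up for illustration):

// Hypothetical transform: flatten the nested address into one string,
// which also sidesteps the nested-object handling in the CSV export
const transformResults = (results) =>
  results.map((doc) => ({
    ...doc,
    fullAddress: `${doc.address?.building} ${doc.address?.street}; ${doc.address?.zipcode}`,
  }));

// In main(): const results = transformResults(await getDocsUsingAggregation());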
Footnotes
I did try passing `aggregate()` and `find()` commands via the `--eval` flag in mongosh. However, `aggregate()` simply wouldn’t work. Plus, the output of `find()` isn’t proper JSON. The closest I got was to `print()` each Document as JSON and then wrap the output in `[]`.
// This errors on seeing $xxx
mongosh "mongodb+srv://your-connection-string/sample_restaurants" --norc --quiet --eval "db.restaurants.aggregate([{$match: {cuisine: 'Italian'}}, {$project: {borough: 1, _id: 1}}, {$limit: 5}])" >| output.json

// This works, but the resulting output.json shows lines of JSON objects
// Have to manually remove the last comma and wrap in [] to become proper JSON
// Need to first remove the line limit by setting
// `config.set("displayBatchSize", 999999999999)` in mongorc.js
mongosh "your-db-connection-string/sample_restaurants" --norc --quiet --eval "db.getCollection('restaurants').find({cuisine: 'Italian'}, {borough: 1, _id: 1}).limit(5).forEach((doc)=>print(JSON.stringify(doc)+','))" >| output.json