How to Build a Web Crawler in Node.js
In this article, we will learn how to build a simple web crawler in Node.js using the axios and cheerio libraries: axios fetches pages over HTTP, and cheerio parses the returned HTML with a jQuery-like API.
Initialize a Node.js Project
Create a project folder and initialize it with npm:
mkdir web-crawler
cd web-crawler
npm init -y
Install Dependencies
Install the two libraries the crawler needs:
npm install axios cheerio
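As a quick sanity check before writing the crawler, the following minimal sketch fetches a page and prints its title. It assumes https://example.com is reachable from your machine, and the file name quick-check.js is just an illustrative choice:

// quick-check.js - verifies axios and cheerio are installed correctly
const axios = require('axios');
const cheerio = require('cheerio');

axios.get('https://example.com')
  .then(response => {
    const $ = cheerio.load(response.data); // parse the HTML into a cheerio document
    console.log($('title').text());       // e.g. "Example Domain"
  })
  .catch(error => console.error(error.message));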
Create a Basic Web Crawler
Let's create an app.js file with the following code:
const axios = require('axios');
const cheerio = require('cheerio');

const startUrl = 'https://example.com';
const maxDepth = 2;
const visitedUrls = new Set();

async function crawl(targetUrl, currentDepth = 0) {
  // Stop once the depth limit is reached or the URL has already been visited
  if (currentDepth > maxDepth || visitedUrls.has(targetUrl)) {
    return;
  }
  visitedUrls.add(targetUrl);

  try {
    const response = await axios.get(targetUrl);
    const $ = cheerio.load(response.data);
    console.log(`Crawling: ${targetUrl}`);

    // Process the page content here.
    // For example, extract and follow all the links:
    const hrefs = $('a')
      .map((_, el) => $(el).attr('href'))
      .get()
      .filter(Boolean);

    for (const href of hrefs) {
      // Resolve relative links against the current page's URL
      const absoluteUrl = new URL(href, targetUrl).href;
      // Skip non-HTTP(S) links such as mailto: or javascript:
      if (!absoluteUrl.startsWith('http')) continue;
      console.log(`Found link: ${absoluteUrl}`);
      await crawl(absoluteUrl, currentDepth + 1);
    }
  } catch (error) {
    console.error(`Error crawling ${targetUrl}: ${error.message}`);
  }
}

crawl(startUrl);
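A few notes on the design: the visitedUrls Set prevents the crawler from fetching the same page twice, the maxDepth check keeps the recursion from running away on large sites, and awaiting each recursive call keeps requests sequential, which is gentler on the target server. In practice you will usually also want to stay on the starting site; one way to do that (an illustrative addition, not part of the code above) is to compare hostnames before recursing:

// Only follow links that stay on the start URL's host (illustrative guard)
if (new URL(absoluteUrl).hostname === new URL(startUrl).hostname) {
  await crawl(absoluteUrl, currentDepth + 1);
}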
Run the Web Crawler
Run the script from the project directory:
node app.js
This starts the crawler at startUrl and follows links up to the maxDepth specified.
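The exact output depends on the pages you crawl; against a small site the log might look something like this (the URLs below are purely illustrative):

Crawling: https://example.com
Found link: https://example.com/about
Crawling: https://example.com/about
Found link: https://example.com/contact
Crawling: https://example.com/contact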