This is the twelfth part of the Chatterbox series. For your convenience you can find other parts in the table of contents in Part 1 – Origins

Scraping is hard, so we should avoid it as much as possible. Some pages let us get our hands on the logical model with a few tricks. Let’s go through a couple of examples. Keep in mind I’m writing this in early 2022, so if you read it later the actual class names or handles may no longer be valid.

Table of Contents

Whatsapp Web

Whatsapp uses React under the hood so first we need to have a helper method for getting React’s props:

// Based on https://stackoverflow.com/a/39165137/1543037
/**
 * Finds the React component that owns a DOM node by following the private
 * reference React stores on the node under a `__reactFiber$…` or
 * `__reactInternalInstance$…` key.
 *
 * Registered on the global object (`globalThis` is `window` in a page) so it
 * can be called from the devtools console.
 *
 * @param {object} dom - DOM node rendered by React.
 * @param {number} [traverseUp=0] - How many extra owning components to climb
 *   past the nearest one.
 * @returns {*} The owning component's `stateNode` (fiber-based React) or
 *   `_instance` (pre-fiber React); null when the node is missing, carries no
 *   React reference, or the climb runs out of ancestors.
 */
globalThis.FindReact = function FindReact(dom, traverseUp = 0) {
	// querySelector()/querySelectorAll()[0] yield null/undefined when nothing matches.
	if (dom == null) return null;

	const key = Object.keys(dom).find(name => {
		return name.startsWith("__reactFiber$") // react 17+
			|| name.startsWith("__reactInternalInstance$"); // react below 17
	});
	const domFiber = key === undefined ? null : dom[key];
	if (domFiber == null) return null;

	// react below 16
	if (domFiber._currentElement) {
		let owner = domFiber._currentElement._owner;
		// Stop as soon as there is no further owner instead of dereferencing null.
		for (let i = 0; i < traverseUp && owner != null; i++) {
			owner = owner._currentElement ? owner._currentElement._owner : null;
		}
		return owner == null ? null : owner._instance;
	}

	// react 16+
	// Climbs from a fiber to the nearest ancestor whose `type` is not a string,
	// i.e. skips plain DOM elements. Returns null when the chain ends first.
	const getCompFiber = fiber => {
		//return fiber._debugOwner; // this also works, but is __DEV__ only
		let parentFiber = fiber.return;
		while (parentFiber != null && typeof parentFiber.type === "string") {
			parentFiber = parentFiber.return;
		}
		return parentFiber == null ? null : parentFiber;
	};
	let compFiber = getCompFiber(domFiber);
	for (let i = 0; i < traverseUp && compFiber != null; i++) {
		compFiber = getCompFiber(compFiber);
	}
	return compFiber == null ? null : compFiber.stateNode;
};

// Based on https://stackoverflow.com/a/39165137/1543037

window.FindReact = function FindReact(dom, traverseUp = 0) {

const key = Object.keys(dom).find(key=>{

return key.startsWith("__reactFiber$") // react 17+

|| key.startsWith("__reactInternalInstance$"); // react below 17

});

const domFiber = dom[key];

if (domFiber == null) return null;

// react below 16

if (domFiber._currentElement) {

let compFiber = domFiber._currentElement._owner;

for (let i = 0; traverseUp > i; i++) {

compFiber = compFiber._currentElement._owner;

}

return compFiber._instance;

}

// react 16+

const GetCompFiber = fiber=>{

//return fiber._debugOwner; // this also works, but is __DEV__ only

let parentFiber = fiber.return;

while (typeof parentFiber.type == "string") {

parentFiber = parentFiber.return;

}

return parentFiber;

};

let compFiber = GetCompFiber(domFiber);

for (let i = 0; traverseUp > i; i++) {

compFiber = GetCompFiber(compFiber);

}

return compFiber.stateNode;

}

We can use this method to get the model from the page easily. Open some chat window and then do this:

// Take the React component behind the message list, keep only the children
// that carry a `msg` prop, and run each message model through parseSingleMessage.
FindReact(
	document.querySelectorAll("div[aria-label='Message list. Press right arrow key on a message to open message context menu.']")[0],
	0
).props.children
	.filter(child => child.props.msg)
	.map(child => child.props.msg)
	.map(msg => parseSingleMessage(msg))

1	FindReact(document.querySelectorAll("div[aria-label='Message list. Press right arrow key on a message to open message context menu.']")[0], 0).props.children.filter(p => p.props.msg).map(p => p.props.msg).map(p => parseSingleMessage(p))

And there you go. Now, the method to parse messages (to give you an idea of what it looks like):

// Pulls a handful of fields out of one message model `p` (the `msg` prop of a
// message-list child) and returns them as an object.
// Illustrative only: the author elided the returned literal, so `{ ... }`
// below is a placeholder and not valid JavaScript — fill it in with the
// locals computed here before running.
window.parseSingleMessage = function parseSingleMessage(p){
	var isMedia = p.__x_isMedia;
	// Truthy when any of the three "sent by me" indicators is truthy.
	var fromMe = p.__x_id.fromMe || p.__x_isSentByMe || p.__x_isSentByMeFromWeb
	// True when `__x_ack` is greater than 0.
	// NOTE(review): this is the only field read via `p.collection` rather than
	// directly off `p` — confirm that is intended.
	var wasRead = p.collection.__x_ack > 0;
	// Body of the quoted message, or "" when there is no quoted message or its body is empty.
	var quotted = ((p.__x_quotedMsg && p.__x_quotedMsg.body) || "");
	// NOTE(review): presumably a Unix timestamp; unit not visible here — confirm.
	var timestamp = p.__x_t;
	var contactName = p.__x_chat.__x_contact.__x_formattedName;
	return { ... };
}

window.parseSingleMessage = function parseSingleMessage(p){

var isMedia = p.__x_isMedia;

var fromMe = p.__x_id.fromMe || p.__x_isSentByMe || p.__x_isSentByMeFromWeb

var wasRead = p.collection.__x_ack > 0;

var quotted = ((p.__x_quotedMsg && p.__x_quotedMsg.body) || "");

var timestamp = p.__x_t;

var contactName = p.__x_chat.__x_contact.__x_formattedName;

return { ... };

}

You can also extract everything from IndexedDB as shown in Whatsapp Backup.

You can also get it from the memory dump, obviously.

Skype

You can get messages from IndexedDB using web.skype.com. However, you can also go to outlook.live.com and open the Skype pane which is implemented with Knockout. To get the model just run

// Ask Knockout for the data bound to the first `.conversation` element and
// read its `conversation` property. Throws if no such element is on the page.
ko.dataFor(document.querySelectorAll(".conversation")[0]).conversation

1	ko.dataFor(document.querySelectorAll(".conversation")[0]).conversation

Teams

Similar trick with IndexedDB:

// Opens the IndexedDB database and walks every record of the "replychains"
// object store, visiting each entry of the record's `messages` map.
// NOTE(review): "space_id" is presumably a placeholder for the real database
// name — confirm before running.
const request = window.indexedDB.open("space_id");
request.onerror = event => console.log("error", event.target.error);
request.onsuccess = event => {
	const db = event.target.result;
	const transaction = db.transaction(["replychains"]);
	transaction.objectStore("replychains").openCursor().onsuccess = function(event) {
		const cursor = event.target.result;
		// A null cursor means every record has been visited.
		if (!cursor) return;

		for (const property in cursor.value.messages) {
			const message = cursor.value.messages[property];
			// Process `message` here (the original leaves this step to the reader).
		}

		// Advance to the next record; without this the handler fires only
		// once and only the first reply chain is ever read.
		cursor.continue();
	};
};

var request = window.indexedDB.open("space_id");

request.onerror = event => console.log("error");

request.onsuccess = event => {

var db = event.target.result;

var transaction = db.transaction(["replychains"]);

var objectStore = transaction.objectStore("replychains").openCursor().onsuccess = function(event) {

var cursor = event.target.result;

if (cursor) {

for(var property in cursor.value.messages){

var message = cursor.value.messages[property];

}

Messenger

This one we can scrape with a Chrome extension that captures the network traffic. Before moving on, a couple of ground rules.

First, we’re going to implement an extension for the devtools pane. This means that you need to open devtools before browsing to the Messenger page. You can do that automatically with Puppeteer.

First, the manifest:

{
  "manifest_version": 2,
  "name": "Scraper",
  "description": "Scrape messages on messenger.com",
  "version": "0.1",
  "minimum_chrome_version": "10.0",
  "permissions": [
    "http://*/*",
    "https://*/*"
  ],
  "devtools_page": "devtools.html",
  "icons": {
    "16": "icon.png",
    "32": "icon.png",
    "48": "icon.png",
    "128": "icon.png"
  }
}

{

"manifest_version": 2,

"name": "Scraper",

"description": "Scrape messages on messenger.com",

"version": "0.1",

"minimum_chrome_version": "10.0",

"permissions": [

"http://*/*",

"https://*/*"

"devtools_page": "devtools.html",

"icons": {

"16": "icon.png",

"32": "icon.png",

"48": "icon.png",

"128": "icon.png"

}

Now, devtools.html:

<!DOCTYPE html>
<html>
<head>
  <!-- DevTools page of the extension; its only job is to load devtools.js. -->
  <script src="devtools.js" type="text/javascript"></script>
</head>
<body>
  
</body>
</html>

< !DOCTYPE html>

< html>

< head>

< script src="devtools.js" type="text/javascript"></script>

< /head>

< body>

< /body>

< /html>

And now the actual devtools code:

// Every message captured so far, across all intercepted responses.
const all = [];
// `timestamp_precise` values already stored in `all`, for O(1) duplicate checks.
const seenTimestamps = new Set();

/**
 * Extracts `o0.data.message_thread` from a raw graphqlbatch response body.
 *
 * @param {string} content - Response body as delivered by `request.getContent`.
 * @returns {object|null} The message thread, or null when the body is empty,
 *   cannot be parsed, or holds no thread.
 */
function parseMessageThread(content) {
	if (!content) return null;

	// Everything from the last '{' onwards is dropped before parsing.
	// NOTE(review): presumably a trailing status object appended to the batch — confirm.
	// When there is nothing before that brace, parse the body as a whole.
	const cut = content.lastIndexOf('{');
	const payload = cut > 0 ? content.substring(0, cut) : content;

	let data;
	try {
		data = JSON.parse(payload);
	} catch (err) {
		console.warn("Could not parse graphqlbatch response", err);
		return null;
	}

	const batchData = data && data.o0 && data.o0.data;
	return (batchData && batchData.message_thread) || null;
}

/**
 * Appends the not-yet-seen messages of a thread to `all`, keyed by
 * `timestamp_precise`. Quotes in the message text are replaced with ",," and
 * each stored message is tagged with `thread_fbid`.
 *
 * @param {object} message_thread - Thread object from the response.
 * @param {Array} nodes - `message_thread.messages.nodes`.
 */
function collectNewMessages(message_thread, nodes) {
	const threadKey = message_thread.thread_key || {};
	for (const message of nodes) {
		if (!message || seenTimestamps.has(message.timestamp_precise)) continue;

		if (message.message && message.message.text) {
			message.message.text = message.message.text.replace(/"/g, ",,").replace(/'/g, ",,");
		}
		message.thread_fbid = threadKey.thread_fbid || threadKey.other_user_id;

		seenTimestamps.add(message.timestamp_precise);
		all.push(message);
	}
}

/**
 * Writes all captured messages into the `current-messages` attribute of the
 * body tag of every tab whose URL contains "facebook.com".
 * NOTE(review): assumes `chrome.tabs` is reachable from the devtools page — confirm.
 */
function publishToFacebookTabs() {
	chrome.tabs.query({}, function (tabs) {
		// Encoded once per publish rather than once per tab. The JSON is turned
		// into UTF-8 bytes and then base64 so it can sit inside a single-quoted
		// string literal; the inverse is decodeURIComponent(escape(atob(value))).
		const encoded = btoa(unescape(encodeURIComponent(JSON.stringify(all))));
		const code = "document.getElementsByTagName('body')[0].setAttribute('current-messages', '" + encoded + "');";

		for (const tab of tabs) {
			if (!tab.url || !tab.url.includes("facebook.com")) continue;

			console.log("Sending to " + tab.id);
			chrome.tabs.executeScript(tab.id, {
				code: "console.log('location:', window.location.href);"
			});
			chrome.tabs.executeScript(tab.id, { code: code });
		}
	});
}

// Inspect every finished request; only facebook graphqlbatch calls are of interest.
chrome.devtools.network.onRequestFinished.addListener(function (request) {
	const url = request.request.url;
	if (!url.includes("facebook") || !url.includes("graphqlbatch")) return;

	request.getContent(function (content) {
		const message_thread = parseMessageThread(content);
		if (!message_thread) return;

		const nodes = message_thread.messages && message_thread.messages.nodes;
		if (!Array.isArray(nodes)) return;

		collectNewMessages(message_thread, nodes);
		publishToFacebookTabs();
	});
});

var all = [];

chrome.devtools.network.onRequestFinished.addListener(

function (request) {

if (request.request.url.indexOf("facebook") > -1 && request.request.url.indexOf("graphqlbatch") > -1) {

request.getContent(function(content, encoding) {

var response = content.substring(0, content.lastIndexOf('{'));

var data = JSON.parse(response);

var message_thread = data.o0.data.message_thread;

if (!message_thread) return;

var messages = message_thread.messages;

if (!messages) return;

messages.nodes.forEach(message => {

if (all.filter(orig => {

return orig.timestamp_precise === message.timestamp_precise;

}).length === 0) {

if(message && message.message && message.message.text){

message.message.text = message.message.text.replace(/"/g, ",,").replace(/'/g, ",,");

}

message.thread_fbid = message_thread.thread_key.thread_fbid || message_thread.thread_key.other_user_id;

all.push(message);

}

})

chrome.tabs.query({}, function(tabs) {

for(var i = 0; i < tabs.length;++i){

if(tabs[i].url && tabs[i].url.indexOf("facebook.com") >= 0){

console.log("Sending to " + tabs[i].id);

chrome.tabs.executeScript(tabs[i].id, {

code: "console.log('location:', window.location.href);"

});

chrome.tabs.executeScript(tabs[i].id, {

code: "document.getElementsByTagName('body')[0].setAttribute('current-messages', '" + btoa(unescape(encodeURIComponent((JSON.stringify(all))))) + "');"

});

}

});

}

});

This code captures requests to graphqlbatch and then extracts messages from the response. It sends them to the facebook tab and stores them in the current-messages attribute of the body tag. Now, you need to grab those messages with JS:

// Decode the `current-messages` attribute of the body tag (base64 of UTF-8
// JSON) and reduce every message to its trimmed text and conversation id.
// Evaluates to [] while the attribute has not been written yet.
((encoded) => {
	if (!encoded) return [];
	return JSON.parse(decodeURIComponent(escape(atob(encoded)))).map(m => ({
		// Some messages carry no text; use "" rather than throwing on `.trim()`.
		messageText: ((m.message && m.message.text) || "").trim(),
		conversation: m.thread_fbid,
	}));
})(document.getElementsByTagName('body')[0].getAttribute('current-messages'));

JSON.parse(decodeURIComponent(escape(atob((document.getElementsByTagName('body')[0].getAttribute('current-messages')))))).map(m => {

var messageText = m.message.text.trim();

var conversation = m.thread_fbid;

});

And that’s it.

Twitter

Just like in Messenger but we’re looking for requests with inbox_initial_state in the response body. Then we can parse.

Discord

Same idea, request to /messages

Slack

Same, request to conversations.history

Summary

There are multiple ways of getting models from the page. We can read it from popular JS frameworks, extract from IndexedDB, or parse network calls. Next time we’ll see how to trace them on the fly.