This commit is contained in:
Thomas Forgione 2023-02-19 10:42:30 +01:00
parent e5cca75737
commit 58afd8ac3e
2 changed files with 67 additions and 171 deletions

123
index.js
View File

@ -4,11 +4,14 @@ const fs = require('fs').promises;
const process = require('process'); const process = require('process');
const puppeteer = require('puppeteer'); const puppeteer = require('puppeteer');
// Size of the rendering of the web page
const size = { width: 1280, height: 720 };
async function main() { async function main() {
if (process.argv[2] === undefined) { if (process.argv[2] === undefined) {
console.error("This program expects an argument."); console.error('This program expects an argument.');
console.error("USAGE: locator <path-to-HTML-file>"); console.error('USAGE: locator <path-to-HTML-file>');
process.exit(1); process.exit(1);
} }
@ -20,44 +23,25 @@ async function main() {
try { try {
await fs.access(path, fs.constants.F_OK); await fs.access(path, fs.constants.F_OK);
} catch (e) { } catch (e) {
console.error("No such file: " + path); console.error('No such file: ' + path);
process.exit(1); process.exit(1);
} }
// Size of the rendering of the web page
const size = { width: 1280, height: 720 };
// Initialize browser // Initialize browser
const browser = await puppeteer.launch(); const browser = await puppeteer.launch();
const page = await browser.newPage(); const page = await browser.newPage();
await page.setViewport(size); await page.setViewport(size);
await page.goto("file://" + path); await page.goto('file://' + path);
// This will contain all the collected information // Only consider the first slide (#\\31 === #1, which is the id of the first slide)
let info = []; // We don't take the other slides into account because they would interfere with our screenshot verification
let root = await page.$('#\\31');
// Take a first screenshot // Take a first screenshot
await page.screenshot({path: __dirname + '/' + 'screenshot1.png'}); await page.screenshot({path: __dirname + '/' + 'screenshot1.png'});
// Edit the page to shrink elements in order to get better bounding boxes // Edit the page to shrink elements in order to get better bounding boxes
for (let selector of ["h1", "h2", "h3", "h4", "h5", "h6", "a", "img", "p", "ul", "ol", "li"]) { let withSpan = await addSpan(root);
let query = selector;
let shouldCreateSpan = query !== "ul" && query !== "ol" && query != "img";
if (shouldCreateSpan) {
await page.evaluate(([query]) => {
for (let e of document.querySelectorAll(query)) {
if (e.children.length === 0) {
e.innerHTML = '<span>' + e.innerHTML + '</span>';
}
}
}, [query]);
}
}
// Take another screenshot and check the modification we made didn't change the layout of the page // Take another screenshot and check the modification we made didn't change the layout of the page
await page.screenshot({path: __dirname + '/' + 'screenshot2.png'}); await page.screenshot({path: __dirname + '/' + 'screenshot2.png'});
@ -72,63 +56,64 @@ async function main() {
throw new Error("Page edit changed the layout"); throw new Error("Page edit changed the layout");
} }
for (let selector of ["h1", "h2", "h3", "h4", "h5", "h6", "a", "img", "p", "ul", "ol", "li"]) { // Analyse the root and output the result
let analyse = await analyseElement(root);
console.log(JSON.stringify(analyse, undefined, 4));
let query = selector; await browser.close();
// Shrink the elements horizontally (to be able to get better bounding boxes) }
let shouldCreateSpan = query !== "ul" && query !== "ol" && query != "img";
// Query the considered element // Traverses the text nodes of the element and put every text into a single span.
let parents = await page.$$(query); async function addSpan(element) {
let elements = await page.$$(query + (shouldCreateSpan ? ' > *:first-child' : '')); let elts = await element.$$('*');
for (let index = 0; index < elements.length; index ++) { for (let elt of elts) {
let value = await elt.evaluate(el => el.textContent, element);
if (value !== "") {
await elt.evaluate(el => el.innerHTML = '<span>' + el.innerHTML + '</span>');
}
}
}
let parent = parents[index]; // Recursive function to analyse an HTML element.
let element = elements[index]; // The output is written in hierarchy.
async function analyseElement(element) {
// Get some information on the element
let tagAttr = await element.getProperty('tagName');
let tagName = await tagAttr.jsonValue();
let classAttr = await element.getProperty('className');
let className = await classAttr.jsonValue();
let textContent = await element.evaluate(el => el.textContent, element);
let box = await element.boundingBox(); let box = await element.boundingBox();
let classElement = shouldCreateSpan ? parent : element; // Register it into the return value
let classNameAttr = await classElement.getProperty('className'); let analyse = {};
let className = await classNameAttr.jsonValue(); analyse.tag = tagName;
analyse.class = className;
analyse.box = box;
analyse.children = [];
// Scale the bounding box // Extract the text content if it is a span (we made those spans by ourselves in the addSpan function)
box.x /= size.width; if (tagName === 'SPAN' && textContent !== "") {
box.width /= size.width; analyse.text = textContent;
box.y /= size.height;
box.height /= size.height;
// Give the selector as type
box.type = selector;
if (className !== "") {
box.class = className;
} }
info.push(box);
// Select the children of this HTML element.
let children = await element.$$('> *');
for (let child of children) {
// Recursively analyse the children
analyse.children.push(await analyseElement(child));
} }
return analyse;
} }
// Sort the info by y and the x (top to bottom, then left to right)
info.sort((a, b) => {
if (a.y < b.y || (a.y == b.y && a.x < b.x)) {
return -1;
}
if (a.y > b.y || (a.y == b.y && a.x > b.x)) {
return 1;
}
return 0;
});
// Pretty print the output info
console.log(JSON.stringify(info, undefined, 4));
await browser.close();
}
main(); main();

View File

@ -1,89 +0,0 @@
#!/usr/bin/env node
const fs = require('fs').promises;
const process = require('process');
const puppeteer = require('puppeteer');
// Size of the rendering of the web page
const size = { width: 1280, height: 720 };
// Builds the CSS id selector for the slide whose id is the given number.
// An id starting with a digit must have that digit escaped as `\3<digit>`.
// When more digits follow, a space has to terminate the escape, otherwise the
// following digits are read as part of the same hexadecimal escape
// (`#\310` is the code point U+0310, not the id "10").
function slideSelector(slideNumber) {
    const digits = String(slideNumber);
    const rest = digits.slice(1);
    return '#\\3' + digits[0] + (rest === '' ? '' : ' ' + rest);
}

// Entry point: loads the HTML file given as first command line argument in a
// headless browser, analyses every slide (elements whose ids are 1, 2, 3, ...)
// and prints the resulting hierarchy as JSON on the standard output.
// Exits with status 1 if the argument is missing or the file is not accessible.
async function main() {
    const target = process.argv[2];

    if (target === undefined) {
        console.error("This program expects an argument.");
        console.error("USAGE: locator <path-to-HTML-file>");
        process.exit(1);
    }

    // Path to the HTML file to analyse (given as relative path from current directory)
    // We need the full path so that puppeteer is able to access it
    const path = target.startsWith('/') ? target : process.cwd() + '/' + target;

    // Check that the file exists (existence is the default check of access)
    try {
        await fs.access(path);
    } catch (e) {
        console.error("No such file: " + path);
        process.exit(1);
    }

    // Initialize browser
    const browser = await puppeteer.launch();

    try {
        const page = await browser.newPage();
        await page.setViewport(size);
        await page.goto("file://" + path);

        // Slides are numbered from 1; stop at the first missing id
        const hierarchy = [];
        for (let currentSlide = 1; ; currentSlide++) {
            const root = await page.$(slideSelector(currentSlide));

            if (root === null) {
                break;
            }

            const currentInfo = {};
            hierarchy.push(currentInfo);
            await analyseElement(root, currentInfo);
        }

        console.log(JSON.stringify(hierarchy, undefined, 4));
    } finally {
        // Always release the browser, even when the analysis fails
        await browser.close();
    }
}
// Recursively analyses an HTML element and fills `hierarchy` in place.
//
// element:   handle of the element to analyse
// hierarchy: object receiving the tag, class, bounding box (x and width are
//            divided by size.width, y and height by size.height) and children
// tabs:      indentation prefix of the trace line printed for this element
// stop:      unused, kept so that existing callers keep working
//
// Elements for which no bounding box is available (boundingBox() resolves to
// null, e.g. for elements that are not rendered) get null coordinates instead
// of aborting the whole analysis.
async function analyseElement(element, hierarchy, tabs = '', stop = false) {
    const tagAttr = await element.getProperty("tagName");
    const tagName = await tagAttr.jsonValue();
    const classAttr = await element.getProperty("className");
    const className = await classAttr.jsonValue();
    const box = await element.boundingBox();
    const hasBox = box !== null;

    hierarchy.tag = tagName;
    hierarchy.class = className;
    hierarchy.x = hasBox ? box.x / size.width : null;
    hierarchy.width = hasBox ? box.width / size.width : null;
    hierarchy.y = hasBox ? box.y / size.height : null;
    hierarchy.height = hasBox ? box.height / size.height : null;
    hierarchy.children = [];

    console.log(tabs + tagName + ' "' + className + '" ' + JSON.stringify(box));

    // Direct children only; each one recurses with a deeper indentation
    const children = await element.$$('> *');

    for (const child of children) {
        const currentInfo = {};
        hierarchy.children.push(currentInfo);
        await analyseElement(child, currentInfo, tabs + ' ', true);
    }
}
// Run the program; report any failure and exit with a non-zero status
// instead of leaving the returned promise unhandled.
main().catch((error) => {
    console.error(error);
    process.exit(1);
});