Skip to content

Page Objects

This guide covers inspecting page objects — the individual elements (text, images, paths, etc.) that make up a PDF page.

PDF pages contain multiple page objects:

  • Text objects — Text strings with font and position
  • Image objects — Embedded images
  • Path objects — Lines, curves, and shapes
  • Shading objects — Gradient fills
  • Form objects — XObject forms (reusable content)

Use getObjects() to retrieve all objects on a page:

// Initialise the library. Each `using` declaration disposes its resource
// automatically when the enclosing scope exits.
using pdfium = await PDFium.init();
// Open the document from the PDF contents held in pdfData.
using document = await pdfium.openDocument(pdfData);
// Load the page at index 0.
using page = document.getPage(0);
// Retrieve the objects on the page and report how many there are.
const objects = page.getObjects();
console.log(`Page has ${objects.length} objects`);

Each object has a type property from PageObjectType:

import { PageObjectType } from '@scaryterry/pdfium';
// Describe every page object according to its PageObjectType.
// Comparing against the discriminant narrows the object, so the
// type-specific members (text, fontSize, width, height) are available.
for (const pageObject of objects) {
  const kind = pageObject.type;

  if (pageObject.type === PageObjectType.Text) {
    console.log(`Text: "${pageObject.text}" (${pageObject.fontSize}pt)`);
  } else if (pageObject.type === PageObjectType.Image) {
    console.log(`Image: ${pageObject.width}×${pageObject.height}`);
  } else if (kind === PageObjectType.Path) {
    console.log('Path object');
  } else if (kind === PageObjectType.Shading) {
    console.log('Shading object');
  } else if (kind === PageObjectType.Form) {
    console.log('Form XObject');
  } else if (kind === PageObjectType.Unknown) {
    console.log('Unknown object type');
  }
  // Any other value prints nothing, as there is no fallback branch.
}
| Value | Description |
| --- | --- |
| Unknown | Unknown or unsupported object type |
| Text | Text string with font information |
| Path | Vector path (lines, curves, shapes) |
| Image | Embedded bitmap image |
| Shading | Gradient or shading pattern |
| Form | Form XObject (reusable content group) |

All page objects extend PDFiumPageObject, which provides:

import { PDFiumPageObject } from '@scaryterry/pdfium';
obj.type; // PageObjectType
obj.bounds; // { left, top, right, bottom } in points
obj.fillColour; // Colour | null
obj.strokeColour; // Colour | null
obj.strokeWidth; // number | null
obj.matrix; // TransformMatrix | null
obj.lineCap; // LineCapStyle
obj.lineJoin; // LineJoinStyle
obj.dashPattern; // DashPattern | null
obj.hasTransparency; // boolean
obj.rotatedBounds; // QuadPoints | null
obj.markCount; // number — content marks
obj.marks; // PageObjectMark[]

Text objects expose content, font size, render mode, and font access:

import { PDFiumTextObject } from '@scaryterry/pdfium';
// PDFiumTextObject extends PDFiumPageObject
obj.text; // The text content
obj.fontSize; // Font size in points
obj.renderMode; // TextRenderMode (Fill, Stroke, etc.)
obj.getFont(); // PDFiumFont | null (dispose when done)

Image objects expose dimensions, raw data, and metadata:

import { PDFiumImageObject } from '@scaryterry/pdfium';
// PDFiumImageObject extends PDFiumPageObject
obj.width; // Image width in pixels
obj.height; // Image height in pixels
obj.getDecodedData(); // Uint8Array | null — decoded pixel data
obj.getRawData(); // Uint8Array | null — raw compressed data
obj.getMetadata(); // ImageMetadata | null — colour space, bpp, etc.

Path objects expose segments and drawing operations:

import { PDFiumPathObject, PDFiumPathSegment } from '@scaryterry/pdfium';
// PDFiumPathObject extends PDFiumPageObject
obj.segmentCount; // Number of segments
obj.getSegment(0); // PDFiumPathSegment | null
obj.getDrawMode(); // { fillMode, stroke } | null
// PDFiumPathSegment
segment.point; // { x, y } | null
segment.type; // PathSegmentType (MoveTo, LineTo, BezierTo)
segment.isClosing; // Whether this segment closes the subpath

Shading and Form objects expose only the base PDFiumPageObject properties (such as type and bounds).

import { PageObjectType, PDFiumTextObject, PDFiumImageObject } from '@scaryterry/pdfium';
// Fetch the page's objects once, then split them by kind.
const objects = page.getObjects();

// Type guards: each narrows a generic page object to its concrete class,
// so the filtered arrays below are typed as that class.
const isTextObject = (candidate: (typeof objects)[number]): candidate is PDFiumTextObject =>
  candidate.type === PageObjectType.Text;
const isImageObject = (candidate: (typeof objects)[number]): candidate is PDFiumImageObject =>
  candidate.type === PageObjectType.Image;

// Get only text objects
const textObjects = objects.filter(isTextObject);
// Get only images
const imageObjects = objects.filter(isImageObject);
/**
 * Lists the text objects on a page together with where they sit.
 *
 * @param page - Page whose objects are inspected.
 * @returns One entry per text object, in page-object order, holding its
 *   text, the left (`x`) and bottom (`y`) edges of its bounds, and its
 *   font size.
 */
function getTextWithPositions(page: PDFiumPage) {
  const textObjects = page
    .getObjects()
    .filter((candidate): candidate is PDFiumTextObject => candidate.type === PageObjectType.Text);

  return Array.from(textObjects, ({ text, fontSize, bounds }) => ({
    text,
    x: bounds.left,
    y: bounds.bottom,
    fontSize,
  }));
}
// Print each text object's content, its (left, bottom) position, and its font size.
const textItems = getTextWithPositions(page);
for (const item of textItems) {
console.log(`"${item.text}" at (${item.x}, ${item.y}), ${item.fontSize}pt`);
}
/**
 * Selects the page objects whose bounds lie entirely inside a rectangle.
 *
 * An object that merely overlaps the rectangle is excluded; an object whose
 * edge coincides with the rectangle's edge is included.
 *
 * @param page - Page whose objects are tested.
 * @param left - Left edge of the region.
 * @param bottom - Bottom edge of the region.
 * @param right - Right edge of the region.
 * @param top - Top edge of the region.
 * @returns The fully contained objects, in page-object order.
 */
function getObjectsInRegion(
  page: PDFiumPage,
  left: number,
  bottom: number,
  right: number,
  top: number,
): PageObject[] {
  const contained: PageObject[] = [];

  for (const candidate of page.getObjects()) {
    const box = candidate.bounds;
    const fitsHorizontally = box.left >= left && box.right <= right;
    const fitsVertically = box.bottom >= bottom && box.top <= top;

    if (fitsHorizontally && fitsVertically) {
      contained.push(candidate);
    }
  }

  return contained;
}
// Get objects in top half of page.
// NOTE(review): the literals presumably assume a 612×792 pt (US Letter) page with
// the origin at the bottom-left — derive them from the actual page size otherwise.
const topHalf = getObjectsInRegion(page, 0, 396, 612, 792);
/**
 * Tallies the objects on a page by the name of their type.
 *
 * @param page - Page whose objects are counted.
 * @returns A record keyed by the PageObjectType member name; types that do
 *   not occur on the page have no key.
 */
function countObjectsByType(page: PDFiumPage): Record<string, number> {
  return page.getObjects().reduce<Record<string, number>>((tally, { type }) => {
    // Indexing the enum by value yields the member's name.
    const label = PageObjectType[type];
    const seenSoFar = tally[label] ?? 0;
    tally[label] = seenSoFar + 1;
    return tally;
  }, {});
}
// Log the per-type tally for the page.
const counts = countObjectsByType(page);
console.log(counts);
// Example output: { Text: 45, Image: 3, Path: 12 }
/**
 * Finds the image object with the greatest pixel area (width × height).
 *
 * @param page - Page whose objects are searched.
 * @returns The largest image, or `undefined` when the page has no images.
 *   When several images share the greatest area, the first one wins.
 */
function findLargestImage(page: PDFiumPage): PDFiumImageObject | undefined {
  const [firstImage, ...otherImages] = page
    .getObjects()
    .filter((candidate): candidate is PDFiumImageObject => candidate.type === PageObjectType.Image);

  if (firstImage === undefined) {
    return undefined;
  }

  let winner = firstImage;
  for (const challenger of otherImages) {
    // Strictly greater, so an equal-area image never displaces an earlier one.
    if (challenger.width * challenger.height > winner.width * winner.height) {
      winner = challenger;
    }
  }
  return winner;
}
// findLargestImage returns undefined when the page has no images, so guard before use.
const largest = findLargestImage(page);
if (largest) {
console.log(`Largest image: ${largest.width}×${largest.height}`);
}

Text objects provide access to their font via getFont(), which returns a PDFiumFont instance:

import { PDFiumTextObject, PDFiumFont, PageObjectType } from '@scaryterry/pdfium';
// Walk the objects on the page at index 0 and print the font details of each text object.
using page = document.getPage(0);
for (const pageObject of page.objects()) {
  // Only text objects carry a font.
  if (pageObject.type !== PageObjectType.Text) continue;

  // `using` disposes the font handle when this iteration ends.
  using font = (pageObject as PDFiumTextObject).getFont();
  if (!font) continue;

  console.log(`Family: ${font.familyName}`);
  console.log(`PostScript name: ${font.fontName}`);
  console.log(`Weight: ${font.weight}`);
  console.log(`Italic angle: ${font.italicAngle}`);
  console.log(`Embedded: ${font.isEmbedded}`);
}
| Property | Type | Description |
| --- | --- | --- |
| familyName | string | Family name (e.g., ‘Helvetica’, ‘Times New Roman’) |
| fontName | string | PostScript/base font name |
| weight | number | Font weight (100–900; 400 = normal, 700 = bold) |
| italicAngle | number | Italic angle in degrees (0 for upright) |
| isEmbedded | boolean | Whether the font data is embedded in the PDF |
| isFixedPitch | boolean | Whether the font is monospaced |
| isSerif | boolean | Whether the font has serifs |
| isItalic | boolean | Whether the font is italic |
| isBold | boolean | Whether the font is bold (weight >= 700 or ForceBold flag) |
| Method | Returns | Description |
| --- | --- | --- |
| getInfo() | FontInfo | All font properties in a single object (more efficient than individual reads) |
| getMetrics(fontSize) | FontMetrics | Ascent and descent at a given font size in points |
| getGlyphWidth(glyphIndex, fontSize) | number | Width of a specific glyph at a given size |
| getFontData() | Uint8Array \| undefined | Raw embedded font data (returns undefined if not embedded) |

Collect unique fonts across an entire document:

/**
 * Gathers font information from every text object in a document, keeping one
 * entry per family name.
 *
 * @param document - Document whose pages are scanned in order.
 * @returns The FontInfo of the first font seen for each distinct family
 *   name, in order of first appearance. Later fonts that share a family
 *   name are ignored.
 */
async function collectFonts(document: PDFiumDocument) {
  const firstSeenByFamily = new Map<string, FontInfo>();

  for (let pageIndex = 0; pageIndex < document.pageCount; pageIndex += 1) {
    using page = document.getPage(pageIndex);

    for (const pageObject of page.objects()) {
      if (pageObject.type === PageObjectType.Text) {
        // The font handle is disposed as soon as this block is left.
        using font = (pageObject as PDFiumTextObject).getFont();
        if (font) {
          const details = font.getInfo();
          const alreadyRecorded = firstSeenByFamily.has(details.familyName);
          if (!alreadyRecorded) {
            firstSeenByFamily.set(details.familyName, details);
          }
        }
      }
    }
  }

  return Array.from(firstSeenByFamily.values());
}
/**
 * Opens a PDF and counts its page objects across all pages.
 *
 * @param pdfData - Contents of the PDF file.
 * @returns Totals for pages and objects, with objects broken down into
 *   text, image, path, and everything else.
 */
async function analyseDocument(pdfData: Uint8Array) {
  using pdfium = await PDFium.init();
  using document = await pdfium.openDocument(pdfData);

  const analysis = {
    totalPages: document.pageCount,
    totalObjects: 0,
    textObjects: 0,
    imageObjects: 0,
    pathObjects: 0,
    otherObjects: 0,
  };

  for (let pageIndex = 0; pageIndex < document.pageCount; pageIndex += 1) {
    using page = document.getPage(pageIndex);
    const pageObjects = page.getObjects();
    analysis.totalObjects += pageObjects.length;

    for (const { type } of pageObjects) {
      if (type === PageObjectType.Text) {
        analysis.textObjects += 1;
      } else if (type === PageObjectType.Image) {
        analysis.imageObjects += 1;
      } else if (type === PageObjectType.Path) {
        analysis.pathObjects += 1;
      } else {
        // Every remaining type is counted together.
        analysis.otherObjects += 1;
      }
    }
  }

  return analysis;
}
import { PDFium, PageObjectType, PDFiumTextObject, PDFiumImageObject } from '@scaryterry/pdfium';
import { promises as fs } from 'fs';
/**
 * Prints a per-page summary of the page objects in a PDF file.
 *
 * For every page this logs the total object count, a preview of up to five
 * text objects (each truncated to 40 characters), the pixel size and
 * (left, bottom) position of every image, and the number of path, shading,
 * and form objects.
 *
 * @param filePath - Path of the PDF file to read.
 * @returns A promise that resolves once the whole document has been reported.
 *   It rejects if reading the file or opening the document fails.
 */
async function inspectPageObjects(filePath: string): Promise<void> {
  const pdfData = await fs.readFile(filePath);
  // `using` disposes the library, document, and page handles on scope exit.
  using pdfium = await PDFium.init();
  using document = await pdfium.openDocument(pdfData);

  console.log(`Document: ${filePath}`);
  console.log(`Pages: ${document.pageCount}\n`);

  for (let pageIndex = 0; pageIndex < document.pageCount; pageIndex++) {
    using page = document.getPage(pageIndex);
    const objects = page.getObjects();

    console.log(`=== Page ${pageIndex + 1} ===`);
    console.log(`Objects: ${objects.length}`);

    // Type-guard filters narrow the arrays without unchecked `as` casts and
    // without naming a page-object type that this example does not import.
    const textObjs = objects.filter(
      (obj): obj is PDFiumTextObject => obj.type === PageObjectType.Text,
    );
    const imageObjs = objects.filter(
      (obj): obj is PDFiumImageObject => obj.type === PageObjectType.Image,
    );
    const countOf = (type: PageObjectType): number =>
      objects.filter((obj) => obj.type === type).length;

    // Report text objects
    if (textObjs.length > 0) {
      console.log(`\nText objects (${textObjs.length}):`);
      for (const text of textObjs.slice(0, 5)) {
        const preview = text.text.substring(0, 40);
        console.log(` "${preview}${text.text.length > 40 ? '...' : ''}" (${text.fontSize}pt)`);
      }
      if (textObjs.length > 5) {
        console.log(` ... and ${textObjs.length - 5} more`);
      }
    }

    // Report image objects
    if (imageObjs.length > 0) {
      console.log(`\nImage objects (${imageObjs.length}):`);
      for (const img of imageObjs) {
        // Only the lower-left corner is reported.
        const { left, bottom } = img.bounds;
        console.log(` ${img.width}×${img.height}px at (${left.toFixed(0)}, ${bottom.toFixed(0)})`);
      }
    }

    // Report other types
    const pathCount = countOf(PageObjectType.Path);
    const shadingCount = countOf(PageObjectType.Shading);
    const formCount = countOf(PageObjectType.Form);
    if (pathCount + shadingCount + formCount > 0) {
      console.log(`\nOther objects:`);
      if (pathCount) console.log(` Paths: ${pathCount}`);
      if (shadingCount) console.log(` Shadings: ${shadingCount}`);
      if (formCount) console.log(` Forms: ${formCount}`);
    }

    console.log('');
  }
}

inspectPageObjects('document.pdf').catch(console.error);
  • getObjects() loads all objects into memory at once — use objects() generator for lazy iteration on large pages
  • The objects() generator yields objects one at a time without array allocation
  • Object bounds are computed on demand and cached
  • Shading and Form XObject content is not inspectable beyond bounds
  • Image raw data requires calling getDecodedData() or getRawData() on the PDFiumImageObject directly