libmobi
C library for handling MOBI format ebook documents
Macros | Functions
parse_rawml.c File Reference

Functions for parsing rawml markup. More...

#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include "parse_rawml.h"
#include "util.h"
#include "opf.h"
#include "structure.h"
#include "index.h"
#include "debug.h"

Macros

#define _GNU_SOURCE   1
 
#define __USE_BSD   /* for strdup on linux/glibc */
 

Functions

size_t mobi_get_rawlink_location (const MOBIRawml *rawml, const uint32_t pos_fid, const uint32_t pos_off)
 Convert kindle:pos:fid:x:off:y to offset in rawml raw text file. More...
 
MOBI_RET mobi_search_links_kf7 (MOBIResult *result, const unsigned char *data_start, const unsigned char *data_end)
 Find first occurrence of attribute to be replaced in KF7 html. More...
 
MOBI_RET mobi_find_attrvalue (MOBIResult *result, const unsigned char *data_start, const unsigned char *data_end, const MOBIFiletype type, const char *needle)
 Find first occurrence of markup attribute with given value. More...
 
MOBI_RET mobi_find_attrname (MOBIResult *result, const unsigned char *data_start, const unsigned char *data_end, const char *attrname)
 Find first occurrence of markup attribute with given name. More...
 
MOBI_RET mobi_search_links_kf8 (MOBIResult *result, const unsigned char *data_start, const unsigned char *data_end, const MOBIFiletype type)
 Find first occurrence of attribute part to be replaced in KF8 html/css. More...
 
size_t mobi_get_attribute_value (char *value, const unsigned char *data, const size_t size, const char *attribute, bool only_quoted)
 Get value and offset of the first found attribute with given name. More...
 
size_t mobi_get_aid_offset (const MOBIPart *html, const char *aid)
 Get offset of the given value of an "aid" attribute in a given part. More...
 
MOBI_RET mobi_get_offset_by_posoff (uint32_t *file_number, size_t *offset, const MOBIRawml *rawml, const size_t pos_fid, const size_t pos_off)
 Convert kindle:pos:fid:x:off:y to skeleton part number and offset from the beginning of the part. More...
 
MOBI_RET mobi_get_aid_by_offset (char *aid, const MOBIPart *html, const size_t offset)
 Get value of the closest "aid" attribute following given offset in a given part. More...
 
MOBI_RET mobi_get_id_by_offset (char *id, const MOBIPart *html, const size_t offset, MOBIAttrType *pref_attr)
 Get value of the closest "id" or "name" attribute following given offset in a given part. More...
 
MOBI_RET mobi_get_aid_by_posoff (uint32_t *file_number, char *aid, const MOBIRawml *rawml, const size_t pos_fid, const size_t pos_off)
 Convert kindle:pos:fid:x:off:y to html file number and closest "aid" attribute following the position. More...
 
MOBI_RET mobi_get_id_by_posoff (uint32_t *file_number, char *id, const MOBIRawml *rawml, const size_t pos_fid, const size_t pos_off, MOBIAttrType *pref_attr)
 Convert kindle:pos:fid:x:off:y to html file number and closest "id" attribute following the position. More...
 
MOBI_RET mobi_reconstruct_resources (const MOBIData *m, MOBIRawml *rawml)
 Parse resource records (images, fonts etc), determine their type, link to rawml. More...
 
MOBI_RET mobi_process_replica (unsigned char *pdf, const char *text, size_t *length)
 Parse Replica Print ebook (azw4). Extract pdf. More...
 
MOBI_RET mobi_reconstruct_flow (MOBIRawml *rawml, const char *text, const size_t length)
 Parse raw text into flow parts. More...
 
MOBI_RET mobi_reconstruct_parts (MOBIRawml *rawml)
 Parse raw html into html parts. Use index entries if present to parse file. More...
 
MOBI_RET mobi_get_filepos_array (MOBIArray *links, const MOBIPart *part)
 Scan html part and build array of filepos link target offsets. More...
 
MOBI_RET mobi_get_ncx_filepos_array (MOBIArray *links, const MOBIRawml *rawml)
 Scan ncx part and build array of filepos link target offsets. More...
 
MOBI_RET mobi_posfid_to_link (char *link, const MOBIRawml *rawml, const char *value, MOBIAttrType *pref_attr)
 Replace kindle:pos link with html href. More...
 
MOBI_RET mobi_flow_to_link (char *link, const MOBIRawml *rawml, const char *value)
 Replace kindle:flow link with html href. More...
 
MOBI_RET mobi_embed_to_link (char *link, const MOBIRawml *rawml, const char *value)
 Replace kindle:embed link with html href. More...
 
MOBI_RET mobi_reconstruct_links_kf8 (const MOBIRawml *rawml)
 Replace offset-links with html-links in KF8 markup. More...
 
MOBI_RET mobi_reconstruct_infl (char *outstring, const MOBIIndx *infl, const MOBIIndexEntry *orth_entry)
 Get infl index markup for given orth entry. More...
 
MOBI_RET mobi_reconstruct_infl_v1 (char *outstring, MOBITrie *const infl_tree, const MOBIIndexEntry *orth_entry)
 Get infl index markup for given orth entry. More...
 
MOBI_RET mobi_reconstruct_orth (const MOBIRawml *rawml, MOBIFragment *first, size_t *new_size)
 Insert orth index markup to linked list of fragments. More...
 
MOBI_RET mobi_reconstruct_links_kf7 (const MOBIRawml *rawml)
 Replace offset-links with html-links in KF7 markup. Also reconstruct dictionary markup if present. More...
 
MOBI_RET mobi_reconstruct_links (const MOBIRawml *rawml)
 Replace offset-links with html-links. More...
 
MOBI_RET mobi_iterate_txtparts (MOBIRawml *rawml, MOBI_RET(*cb)(MOBIPart *))
 Call callback function for each text record. More...
 
MOBI_RET mobi_markup_to_utf8 (MOBIPart *part)
 Convert MOBIPart part data to utf8. More...
 
MOBI_RET mobi_strip_mobitags (MOBIPart *part)
 Strip unneeded tags from html. Currently only <aid> More...
 
MOBI_RET mobi_parse_rawml (MOBIRawml *rawml, const MOBIData *m)
 Parse raw records into html flow parts, markup parts, resources and indices. More...
 
MOBI_RET mobi_parse_rawml_opt (MOBIRawml *rawml, const MOBIData *m, bool parse_toc, bool parse_dict, bool reconstruct)
 Parse raw records into html flow parts, markup parts, resources and indices. Individual stages of the parsing may be turned on/off. More...
 

Detailed Description

Functions for parsing rawml markup.

Copyright (c) 2020 Bartek Fabiszewski http://www.fabiszewski.net

This file is part of libmobi. Licensed under LGPL, either version 3, or any later. See http://www.gnu.org/licenses/

Function Documentation

◆ mobi_embed_to_link()

MOBI_RET mobi_embed_to_link ( char *  link,
const MOBIRawml *  rawml,
const char *  value
)

Replace kindle:embed link with html href.

Parameters
[in,out] link: Memory area which will be filled with "resource00000.ext", including quotation marks
[in] rawml: Structure rawml
[in] value: String kindle:embed:0000?mime=type, with optional quotation marks
Returns
MOBI_RET status code (on success MOBI_SUCCESS)

◆ mobi_find_attrname()

MOBI_RET mobi_find_attrname ( MOBIResult *  result,
const unsigned char *  data_start,
const unsigned char *  data_end,
const char *  attrname
)

Find first occurrence of markup attribute with given name.

Parameters
[in,out] result: MOBIResult structure will be filled with found data
[in] data_start: Beginning of the memory area to search in
[in] data_end: End of the memory area to search in
[in] attrname: String to find (len < MOBI_ATTRNAME_MAXSIZE)
Returns
MOBI_RET status code (on success MOBI_SUCCESS)

◆ mobi_find_attrvalue()

MOBI_RET mobi_find_attrvalue ( MOBIResult *  result,
const unsigned char *  data_start,
const unsigned char *  data_end,
const MOBIFiletype  type,
const char *  needle
)

Find first occurrence of markup attribute with given value.

Parameters
[in,out] result: MOBIResult structure will be filled with found data
[in] data_start: Beginning of the memory area to search in
[in] data_end: End of the memory area to search in
[in] type: Type of data (T_HTML or T_CSS)
[in] needle: String to find (len <= MOBI_ATTRNAME_MAXSIZE)
Returns
MOBI_RET status code (on success MOBI_SUCCESS)

◆ mobi_flow_to_link()

MOBI_RET mobi_flow_to_link ( char *  link,
const MOBIRawml *  rawml,
const char *  value
)

Replace kindle:flow link with html href.

Parameters
[in,out] link: Memory area which will be filled with "part00000.ext", including quotation marks
[in] rawml: Structure rawml
[in] value: String kindle:flow:0000?mime=type, without quotation marks
Returns
MOBI_RET status code (on success MOBI_SUCCESS)

◆ mobi_get_aid_by_offset()

MOBI_RET mobi_get_aid_by_offset ( char *  aid,
const MOBIPart *  html,
const size_t  offset
)

Get value of the closest "aid" attribute following given offset in a given part.

Parameters
[in,out] aid: String value of "aid" attribute
[in] html: MOBIPart html part
[in] offset: Offset from the beginning of the part data
Returns
MOBI_RET status code (on success MOBI_SUCCESS)

◆ mobi_get_aid_by_posoff()

MOBI_RET mobi_get_aid_by_posoff ( uint32_t *  file_number,
char *  aid,
const MOBIRawml *  rawml,
const size_t  pos_fid,
const size_t  pos_off
)

Convert kindle:pos:fid:x:off:y to html file number and closest "aid" attribute following the position.

Parameters
[in,out] file_number: Will be set to file number value
[in,out] aid: String value of "aid" attribute
[in] rawml: MOBIRawml parsed records structure
[in] pos_fid: X value of pos:fid:x
[in] pos_off: Y value of off:y
Returns
MOBI_RET status code (on success MOBI_SUCCESS)

◆ mobi_get_aid_offset()

size_t mobi_get_aid_offset ( const MOBIPart *  html,
const char *  aid
)

Get offset of the given value of an "aid" attribute in a given part.

Parameters
[in] aid: String value of "aid" attribute
[in] html: MOBIPart html part
Returns
Offset from the beginning of the html part data, SIZE_MAX on failure

◆ mobi_get_attribute_value()

size_t mobi_get_attribute_value ( char *  value,
const unsigned char *  data,
const size_t  size,
const char *  attribute,
bool  only_quoted
)

Get value and offset of the first found attribute with given name.

Parameters
[in,out] value: String value of the attribute, will be filled by the function, zero length if not found
[in] data: Data to search in
[in] size: Data size
[in] attribute: Attribute name
[in] only_quoted: Require the value to be quoted if true, allow no quotes (eg. filepos=00001) if false
Returns
Offset from the beginning of the data, SIZE_MAX if not found

◆ mobi_get_filepos_array()

MOBI_RET mobi_get_filepos_array ( MOBIArray *  links,
const MOBIPart *  part
)

Scan html part and build array of filepos link target offsets.

Parameters
[in,out] links: MOBIArray structure for link target offsets array
[in] part: MOBIPart html part structure
Returns
MOBI_RET status code (on success MOBI_SUCCESS)

◆ mobi_get_id_by_offset()

MOBI_RET mobi_get_id_by_offset ( char *  id,
const MOBIPart *  html,
const size_t  offset,
MOBIAttrType *  pref_attr
)

Get value of the closest "id" or "name" attribute following given offset in a given part.

Parameters
[in,out] id: String value of found attribute
[in] html: MOBIPart html part
[in] offset: Offset from the beginning of the part data
[in,out] pref_attr: Preferred attribute to link to (id or name)
Returns
MOBI_RET status code (on success MOBI_SUCCESS)

◆ mobi_get_id_by_posoff()

MOBI_RET mobi_get_id_by_posoff ( uint32_t *  file_number,
char *  id,
const MOBIRawml *  rawml,
const size_t  pos_fid,
const size_t  pos_off,
MOBIAttrType *  pref_attr
)

Convert kindle:pos:fid:x:off:y to html file number and closest "id" attribute following the position.

Parameters
[in,out] file_number: Will be set to file number value
[in,out] id: String value of "id" attribute
[in] rawml: MOBIRawml parsed records structure
[in] pos_fid: X value of pos:fid:x
[in] pos_off: Y value of off:y
[in,out] pref_attr: Attribute to link to
Returns
MOBI_RET status code (on success MOBI_SUCCESS)

◆ mobi_get_ncx_filepos_array()

MOBI_RET mobi_get_ncx_filepos_array ( MOBIArray *  links,
const MOBIRawml *  rawml
)

Scan ncx part and build array of filepos link target offsets.

Parameters
[in,out] links: MOBIArray structure for link target offsets array
[in] rawml: MOBIRawml parsed records structure
Returns
MOBI_RET status code (on success MOBI_SUCCESS)

◆ mobi_get_offset_by_posoff()

MOBI_RET mobi_get_offset_by_posoff ( uint32_t *  file_number,
size_t *  offset,
const MOBIRawml *  rawml,
const size_t  pos_fid,
const size_t  pos_off
)

Convert kindle:pos:fid:x:off:y to skeleton part number and offset from the beginning of the part.

Parameters
[in,out] file_number: Will be set to file number value
[in,out] offset: Offset from the beginning of the skeleton part
[in] rawml: MOBIRawml parsed records structure
[in] pos_fid: X value of pos:fid:x
[in] pos_off: Y value of off:y
Returns
MOBI_RET status code (on success MOBI_SUCCESS)

◆ mobi_get_rawlink_location()

size_t mobi_get_rawlink_location ( const MOBIRawml *  rawml,
const uint32_t  pos_fid,
const uint32_t  pos_off
)

Convert kindle:pos:fid:x:off:y to offset in rawml raw text file.

Parameters
[in] rawml: MOBIRawml parsed records structure
[in] pos_fid: X value of pos:fid:x
[in] pos_off: Y value of off:y
Returns
Offset in rawml buffer on success, SIZE_MAX otherwise

◆ mobi_iterate_txtparts()

MOBI_RET mobi_iterate_txtparts ( MOBIRawml *  rawml,
MOBI_RET(*)(MOBIPart *)  cb
)

Call callback function for each text record.

Parameters
[in,out] rawml: Structure rawml will be filled with reconstructed parts and resources
[in,out] cb: Callback function
Returns
MOBI_RET status code (on success MOBI_SUCCESS)

◆ mobi_markup_to_utf8()

MOBI_RET mobi_markup_to_utf8 ( MOBIPart *  part)

Convert MOBIPart part data to utf8.

Parameters
[in,out] part: MOBIPart part
Returns
MOBI_RET status code (on success MOBI_SUCCESS)

◆ mobi_posfid_to_link()

MOBI_RET mobi_posfid_to_link ( char *  link,
const MOBIRawml *  rawml,
const char *  value,
MOBIAttrType *  pref_attr
)

Replace kindle:pos link with html href.

Parameters
[in,out] link: Memory area which will be filled with "part00000.html#customid", including quotation marks
[in] rawml: Structure rawml
[in] value: String kindle:pos:fid:0000:off:0000000000, without quotation marks
[in,out] pref_attr: Preferred attribute to link to (id or name)
Returns
MOBI_RET status code (on success MOBI_SUCCESS)

◆ mobi_process_replica()

MOBI_RET mobi_process_replica ( unsigned char *  pdf,
const char *  text,
size_t *  length
)

Parse Replica Print ebook (azw4). Extract pdf.

Todo:
Parse remaining data from the file
Parameters
[in,out] pdf: Memory area will be filled with extracted pdf data
[in] text: Raw decompressed text to be parsed
[in,out] length: Text length. Will be updated with pdf_length on return
Returns
MOBI_RET status code (on success MOBI_SUCCESS)

◆ mobi_reconstruct_flow()

MOBI_RET mobi_reconstruct_flow ( MOBIRawml *  rawml,
const char *  text,
const size_t  length
)

Parse raw text into flow parts.

Parameters
[in,out] rawml: Structure rawml->flow will be filled with parsed flow text parts
[in] text: Raw decompressed text to be parsed
[in] length: Text length
Returns
MOBI_RET status code (on success MOBI_SUCCESS)

◆ mobi_reconstruct_infl()

MOBI_RET mobi_reconstruct_infl ( char *  outstring,
const MOBIIndx *  infl,
const MOBIIndexEntry *  orth_entry
)

Get infl index markup for given orth entry.

Parameters
[in,out] outstring: Reconstructed tag <idx:infl>
[in] infl: MOBIIndx structure with parsed infl index
[in] orth_entry: Orth index entry
Returns
MOBI_RET status code (on success MOBI_SUCCESS)

◆ mobi_reconstruct_infl_v1()

MOBI_RET mobi_reconstruct_infl_v1 ( char *  outstring,
MOBITrie *const  infl_tree,
const MOBIIndexEntry *  orth_entry
)

Get infl index markup for given orth entry.

This function handles the inflections scheme used in older mobipocket dictionaries.

Parameters
[in,out] outstring: Reconstructed tag <idx:infl>
[in] infl_tree: MOBITrie structure with inflection rules
[in] orth_entry: Orth index entry
Returns
MOBI_RET status code (on success MOBI_SUCCESS)

◆ mobi_reconstruct_links()

MOBI_RET mobi_reconstruct_links ( const MOBIRawml *  rawml)

Replace offset-links with html-links.

Parameters
[in,out] rawml: Structure rawml will be filled with reconstructed parts and resources
Returns
MOBI_RET status code (on success MOBI_SUCCESS)

◆ mobi_reconstruct_links_kf7()

MOBI_RET mobi_reconstruct_links_kf7 ( const MOBIRawml *  rawml)

Replace offset-links with html-links in KF7 markup. Also reconstruct dictionary markup if present.

Parameters
[in,out] rawml: Structure rawml will be filled with reconstructed parts and resources
Returns
MOBI_RET status code (on success MOBI_SUCCESS)

◆ mobi_reconstruct_links_kf8()

MOBI_RET mobi_reconstruct_links_kf8 ( const MOBIRawml *  rawml)

Replace offset-links with html-links in KF8 markup.

Parameters
[in,out] rawml: Structure rawml will be filled with reconstructed parts and resources
Returns
MOBI_RET status code (on success MOBI_SUCCESS)

◆ mobi_reconstruct_orth()

MOBI_RET mobi_reconstruct_orth ( const MOBIRawml *  rawml,
MOBIFragment *  first,
size_t *  new_size
)

Insert orth index markup to linked list of fragments.

Parameters
[in] rawml: Structure rawml contains orth index data
[in,out] first: First element of the linked list
[in,out] new_size: Counter to be updated with inserted fragments size
Returns
MOBI_RET status code (on success MOBI_SUCCESS)

◆ mobi_reconstruct_parts()

MOBI_RET mobi_reconstruct_parts ( MOBIRawml *  rawml)

Parse raw html into html parts. Use index entries if present to parse file.

Parameters
[in,out] rawml: Structure rawml->markup will be filled with reconstructed html parts
Returns
MOBI_RET status code (on success MOBI_SUCCESS)

◆ mobi_reconstruct_resources()

MOBI_RET mobi_reconstruct_resources ( const MOBIData *  m,
MOBIRawml *  rawml
)

Parse resource records (images, fonts etc), determine their type, link to rawml.

Parameters
[in] m: MOBIData structure with loaded Record(s) 0 headers
[in,out] rawml: Structure rawml->resources will be filled with parsed resources metadata and linked records data
Returns
MOBI_RET status code (on success MOBI_SUCCESS)

◆ mobi_search_links_kf7()

MOBI_RET mobi_search_links_kf7 ( MOBIResult *  result,
const unsigned char *  data_start,
const unsigned char *  data_end
)

Find first occurrence of attribute to be replaced in KF7 html.

It searches for filepos and recindex attributes

Parameters
[in,out] result: MOBIResult structure will be filled with found data
[in] data_start: Beginning of the memory area to search in
[in] data_end: End of the memory area to search in
Returns
MOBI_RET status code (on success MOBI_SUCCESS)

◆ mobi_search_links_kf8()

MOBI_RET mobi_search_links_kf8 ( MOBIResult *  result,
const unsigned char *  data_start,
const unsigned char *  data_end,
const MOBIFiletype  type
)

Find first occurrence of attribute part to be replaced in KF8 html/css.

It searches for "kindle:" value in attributes

Parameters
[in,out] result: MOBIResult structure will be filled with found data
[in] data_start: Beginning of the memory area to search in
[in] data_end: End of the memory area to search in
[in] type: Type of data (T_HTML or T_CSS)
Returns
MOBI_RET status code (on success MOBI_SUCCESS)

◆ mobi_strip_mobitags()

MOBI_RET mobi_strip_mobitags ( MOBIPart *  part)

Strip unneeded tags from html. Currently only <aid>

Parameters
[in,out] part: MOBIPart structure
Returns
MOBI_RET status code (on success MOBI_SUCCESS)