|
libmobi
C library for handling MOBI format ebook documents
|
Functions for parsing rawml markup. More...
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include "parse_rawml.h"
#include "util.h"
#include "opf.h"
#include "structure.h"
#include "index.h"
#include "debug.h"
Macros | |
| #define | _GNU_SOURCE 1 |
| #define | __USE_BSD /* for strdup on linux/glibc */ |
Functions | |
| size_t | mobi_get_rawlink_location (const MOBIRawml *rawml, const uint32_t pos_fid, const uint32_t pos_off) |
| Convert kindle:pos:fid:x:off:y to offset in rawml raw text file. More... | |
| MOBI_RET | mobi_search_links_kf7 (MOBIResult *result, const unsigned char *data_start, const unsigned char *data_end) |
| Find first occurrence of attribute to be replaced in KF7 html. More... | |
| MOBI_RET | mobi_find_attrvalue (MOBIResult *result, const unsigned char *data_start, const unsigned char *data_end, const MOBIFiletype type, const char *needle) |
| Find first occurrence of markup attribute with given value. More... | |
| MOBI_RET | mobi_find_attrname (MOBIResult *result, const unsigned char *data_start, const unsigned char *data_end, const char *attrname) |
| Find first occurrence of markup attribute with given name. More... | |
| MOBI_RET | mobi_search_links_kf8 (MOBIResult *result, const unsigned char *data_start, const unsigned char *data_end, const MOBIFiletype type) |
| Find first occurrence of attribute part to be replaced in KF8 html/css. More... | |
| size_t | mobi_get_attribute_value (char *value, const unsigned char *data, const size_t size, const char *attribute, bool only_quoted) |
| Get value and offset of the first found attribute with given name. More... | |
| size_t | mobi_get_aid_offset (const MOBIPart *html, const char *aid) |
| Get offset of the given value of an "aid" attribute in a given part. More... | |
| MOBI_RET | mobi_get_offset_by_posoff (uint32_t *file_number, size_t *offset, const MOBIRawml *rawml, const size_t pos_fid, const size_t pos_off) |
| Convert kindle:pos:fid:x:off:y to skeleton part number and offset from the beginning of the part. More... | |
| MOBI_RET | mobi_get_aid_by_offset (char *aid, const MOBIPart *html, const size_t offset) |
| Get value of the closest "aid" attribute following given offset in a given part. More... | |
| MOBI_RET | mobi_get_id_by_offset (char *id, const MOBIPart *html, const size_t offset, MOBIAttrType *pref_attr) |
| Get value of the closest "id" or "name" attribute following given offset in a given part. More... | |
| MOBI_RET | mobi_get_aid_by_posoff (uint32_t *file_number, char *aid, const MOBIRawml *rawml, const size_t pos_fid, const size_t pos_off) |
| Convert kindle:pos:fid:x:off:y to html file number and closest "aid" attribute following the position. More... | |
| MOBI_RET | mobi_get_id_by_posoff (uint32_t *file_number, char *id, const MOBIRawml *rawml, const size_t pos_fid, const size_t pos_off, MOBIAttrType *pref_attr) |
| Convert kindle:pos:fid:x:off:y to html file number and closest "id" attribute following the position. More... | |
| MOBI_RET | mobi_reconstruct_resources (const MOBIData *m, MOBIRawml *rawml) |
| Parse resource records (images, fonts etc), determine their type, link to rawml. More... | |
| MOBI_RET | mobi_process_replica (unsigned char *pdf, const char *text, size_t *length) |
| Parse Replica Print ebook (azw4). Extract pdf. More... | |
| MOBI_RET | mobi_reconstruct_flow (MOBIRawml *rawml, const char *text, const size_t length) |
| Parse raw text into flow parts. More... | |
| MOBI_RET | mobi_reconstruct_parts (MOBIRawml *rawml) |
| Parse raw html into html parts. Use index entries if present to parse file. More... | |
| MOBI_RET | mobi_get_filepos_array (MOBIArray *links, const MOBIPart *part) |
| Scan html part and build array of filepos link target offsets. More... | |
| MOBI_RET | mobi_get_ncx_filepos_array (MOBIArray *links, const MOBIRawml *rawml) |
| Scan ncx part and build array of filepos link target offsets. More... | |
| MOBI_RET | mobi_posfid_to_link (char *link, const MOBIRawml *rawml, const char *value, MOBIAttrType *pref_attr) |
| Replace kindle:pos link with html href. More... | |
| MOBI_RET | mobi_flow_to_link (char *link, const MOBIRawml *rawml, const char *value) |
| Replace kindle:flow link with html href. More... | |
| MOBI_RET | mobi_embed_to_link (char *link, const MOBIRawml *rawml, const char *value) |
| Replace kindle:embed link with html href. More... | |
| MOBI_RET | mobi_reconstruct_links_kf8 (const MOBIRawml *rawml) |
| Replace offset-links with html-links in KF8 markup. More... | |
| MOBI_RET | mobi_reconstruct_infl (char *outstring, const MOBIIndx *infl, const MOBIIndexEntry *orth_entry) |
| Get infl index markup for given orth entry. More... | |
| MOBI_RET | mobi_reconstruct_infl_v1 (char *outstring, MOBITrie *const infl_tree, const MOBIIndexEntry *orth_entry) |
| Get infl index markup for given orth entry. More... | |
| MOBI_RET | mobi_reconstruct_orth (const MOBIRawml *rawml, MOBIFragment *first, size_t *new_size) |
| Insert orth index markup to linked list of fragments. More... | |
| MOBI_RET | mobi_reconstruct_links_kf7 (const MOBIRawml *rawml) |
| Replace offset-links with html-links in KF7 markup. Also reconstruct dictionary markup if present. More... | |
| MOBI_RET | mobi_reconstruct_links (const MOBIRawml *rawml) |
| Replace offset-links with html-links. More... | |
| MOBI_RET | mobi_iterate_txtparts (MOBIRawml *rawml, MOBI_RET(*cb)(MOBIPart *)) |
| Call callback function for each text record. More... | |
| MOBI_RET | mobi_markup_to_utf8 (MOBIPart *part) |
| Convert MOBIPart part data to utf8. More... | |
| MOBI_RET | mobi_strip_mobitags (MOBIPart *part) |
| Strip unneeded tags from html. Currently only <aid> More... | |
| MOBI_RET | mobi_parse_rawml (MOBIRawml *rawml, const MOBIData *m) |
| Parse raw records into html flow parts, markup parts, resources and indices. More... | |
| MOBI_RET | mobi_parse_rawml_opt (MOBIRawml *rawml, const MOBIData *m, bool parse_toc, bool parse_dict, bool reconstruct) |
| Parse raw records into html flow parts, markup parts, resources and indices. Individual stages of the parsing may be turned on/off. More... | |
Functions for parsing rawml markup.
Copyright (c) 2020 Bartek Fabiszewski http://www.fabiszewski.net
This file is part of libmobi. Licensed under LGPL, either version 3, or any later. See http://www.gnu.org/licenses/
Replace kindle:embed link with html href.
| [in,out] | link | Memory area which will be filled with "resource00000.ext", including quotation marks |
| [in] | rawml | Structure rawml |
| [in] | value | String kindle:embed:0000?mime=type, with optional quotation marks |
| MOBI_RET mobi_find_attrname | ( | MOBIResult * | result, |
| const unsigned char * | data_start, | ||
| const unsigned char * | data_end, | ||
| const char * | attrname | ||
| ) |
Find first occurrence of markup attribute with given name.
| [in,out] | result | MOBIResult structure will be filled with found data |
| [in] | data_start | Beginning of the memory area to search in |
| [in] | data_end | End of the memory area to search in |
| [in] | attrname | String to find (len < MOBI_ATTRNAME_MAXSIZE) |
| MOBI_RET mobi_find_attrvalue | ( | MOBIResult * | result, |
| const unsigned char * | data_start, | ||
| const unsigned char * | data_end, | ||
| const MOBIFiletype | type, | ||
| const char * | needle | ||
| ) |
Find first occurrence of markup attribute with given value.
| [in,out] | result | MOBIResult structure will be filled with found data |
| [in] | data_start | Beginning of the memory area to search in |
| [in] | data_end | End of the memory area to search in |
| [in] | type | Type of data (T_HTML or T_CSS) |
| [in] | needle | String to find (len <= MOBI_ATTRNAME_MAXSIZE) |
Replace kindle:flow link with html href.
| [in,out] | link | Memory area which will be filled with "part00000.ext", including quotation marks |
| [in] | rawml | Structure rawml |
| [in] | value | String kindle:flow:0000?mime=type, without quotation marks |
Get value of the closest "aid" attribute following given offset in a given part.
| [in,out] | aid | String value of "aid" attribute |
| [in] | html | MOBIPart html part |
| [in] | offset | Offset from the beginning of the part data |
| MOBI_RET mobi_get_aid_by_posoff | ( | uint32_t * | file_number, |
| char * | aid, | ||
| const MOBIRawml * | rawml, | ||
| const size_t | pos_fid, | ||
| const size_t | pos_off | ||
| ) |
Convert kindle:pos:fid:x:off:y to html file number and closest "aid" attribute following the position.
| [in,out] | file_number | Will be set to file number value |
| [in,out] | aid | String value of "aid" attribute |
| [in] | rawml | MOBIRawml parsed records structure |
| [in] | pos_fid | X value of pos:fid:x |
| [in] | pos_off | Y value of off:y |
| size_t mobi_get_aid_offset | ( | const MOBIPart * | html, |
| const char * | aid | ||
| ) |
Get offset of the given value of an "aid" attribute in a given part.
| [in] | aid | String value of "aid" attribute |
| [in] | html | MOBIPart html part |
| size_t mobi_get_attribute_value | ( | char * | value, |
| const unsigned char * | data, | ||
| const size_t | size, | ||
| const char * | attribute, | ||
| bool | only_quoted | ||
| ) |
Get value and offset of the first found attribute with given name.
| [in,out] | value | String value of the attribute, will be filled by the function, zero length if not found |
| [in] | data | Data to search in |
| [in] | size | Data size |
| [in] | attribute | Attribute name |
| [in] | only_quoted | Require the value to be quoted if true, allow no quotes (eg. filepos=00001) if false |
| MOBI_RET mobi_get_id_by_offset | ( | char * | id, |
| const MOBIPart * | html, | ||
| const size_t | offset, | ||
| MOBIAttrType * | pref_attr | ||
| ) |
Get value of the closest "id" or "name" attribute following given offset in a given part.
| [in,out] | id | String value of found attribute |
| [in] | html | MOBIPart html part |
| [in] | offset | Offset from the beginning of the part data |
| [in,out] | pref_attr | Preferred attribute to link to (id or name) |
| MOBI_RET mobi_get_id_by_posoff | ( | uint32_t * | file_number, |
| char * | id, | ||
| const MOBIRawml * | rawml, | ||
| const size_t | pos_fid, | ||
| const size_t | pos_off, | ||
| MOBIAttrType * | pref_attr | ||
| ) |
Convert kindle:pos:fid:x:off:y to html file number and closest "id" attribute following the position.
| [in,out] | file_number | Will be set to file number value |
| [in,out] | id | String value of "id" attribute |
| [in] | rawml | MOBIRawml parsed records structure |
| [in] | pos_fid | X value of pos:fid:x |
| [in] | pos_off | Y value of off:y |
| [in,out] | pref_attr | Attribute to link to |
| MOBI_RET mobi_get_offset_by_posoff | ( | uint32_t * | file_number, |
| size_t * | offset, | ||
| const MOBIRawml * | rawml, | ||
| const size_t | pos_fid, | ||
| const size_t | pos_off | ||
| ) |
Convert kindle:pos:fid:x:off:y to skeleton part number and offset from the beginning of the part.
| [in,out] | file_number | Will be set to file number value |
| [in,out] | offset | Offset from the beginning of the skeleton part |
| [in] | rawml | MOBIRawml parsed records structure |
| [in] | pos_fid | X value of pos:fid:x |
| [in] | pos_off | Y value of off:y |
| size_t mobi_get_rawlink_location | ( | const MOBIRawml * | rawml, |
| const uint32_t | pos_fid, | ||
| const uint32_t | pos_off | ||
| ) |
Convert kindle:pos:fid:x:off:y to offset in rawml raw text file.
| [in] | rawml | MOBIRawml parsed records structure |
| [in] | pos_fid | X value of pos:fid:x |
| [in] | pos_off | Y value of off:y |
Call callback function for each text record.
| [in,out] | rawml | Structure rawml will be filled with reconstructed parts and resources |
| [in,out] | cb | Callback function |
| MOBI_RET mobi_posfid_to_link | ( | char * | link, |
| const MOBIRawml * | rawml, | ||
| const char * | value, | ||
| MOBIAttrType * | pref_attr | ||
| ) |
Replace kindle:pos link with html href.
| [in,out] | link | Memory area which will be filled with "part00000.html#customid", including quotation marks |
| [in] | rawml | Structure rawml |
| [in] | value | String kindle:pos:fid:0000:off:0000000000, without quotation marks |
| [in,out] | pref_attr | Preferred attribute to link to (id or name) |
| MOBI_RET mobi_process_replica | ( | unsigned char * | pdf, |
| const char * | text, | ||
| size_t * | length | ||
| ) |
Parse Replica Print ebook (azw4). Extract pdf.
| [in,out] | pdf | Memory area will be filled with extracted pdf data |
| [in] | text | Raw decompressed text to be parsed |
| [in,out] | length | Text length. Will be updated with pdf_length on return |
Parse raw text into flow parts.
| [in,out] | rawml | Structure rawml->flow will be filled with parsed flow text parts |
| [in] | text | Raw decompressed text to be parsed |
| [in] | length | Text length |
| MOBI_RET mobi_reconstruct_infl | ( | char * | outstring, |
| const MOBIIndx * | infl, | ||
| const MOBIIndexEntry * | orth_entry | ||
| ) |
Get infl index markup for given orth entry.
| [in,out] | outstring | Reconstructed tag <idx:infl> |
| [in] | infl | MOBIIndx structure with parsed infl index |
| [in] | orth_entry | Orth index entry |
| MOBI_RET mobi_reconstruct_infl_v1 | ( | char * | outstring, |
| MOBITrie *const | infl_tree, | ||
| const MOBIIndexEntry * | orth_entry | ||
| ) |
Get infl index markup for given orth entry.
This function handles the inflection scheme used in older Mobipocket dictionaries.
| [in,out] | outstring | Reconstructed tag <idx:infl> |
| [in] | infl_tree | MOBITrie structure with inflection rules |
| [in] | orth_entry | Orth index entry |
Replace offset-links with html-links.
| [in,out] | rawml | Structure rawml will be filled with reconstructed parts and resources |
Replace offset-links with html-links in KF7 markup. Also reconstruct dictionary markup if present.
| [in,out] | rawml | Structure rawml will be filled with reconstructed parts and resources |
Replace offset-links with html-links in KF8 markup.
| [in,out] | rawml | Structure rawml will be filled with reconstructed parts and resources |
| MOBI_RET mobi_reconstruct_orth | ( | const MOBIRawml * | rawml, |
| MOBIFragment * | first, | ||
| size_t * | new_size | ||
| ) |
Insert orth index markup to linked list of fragments.
| [in] | rawml | Structure rawml contains orth index data |
| [in,out] | first | First element of the linked list |
| [in,out] | new_size | Counter to be updated with inserted fragments size |
Parse raw html into html parts. Use index entries if present to parse file.
| [in,out] | rawml | Structure rawml->markup will be filled with reconstructed html parts |
Parse resource records (images, fonts etc), determine their type, link to rawml.
| [in] | m | MOBIData structure with loaded Record(s) 0 headers |
| [in,out] | rawml | Structure rawml->resources will be filled with parsed resources metadata and linked records data |
| MOBI_RET mobi_search_links_kf7 | ( | MOBIResult * | result, |
| const unsigned char * | data_start, | ||
| const unsigned char * | data_end | ||
| ) |
Find first occurrence of attribute to be replaced in KF7 html.
It searches for filepos and recindex attributes
| [in,out] | result | MOBIResult structure will be filled with found data |
| [in] | data_start | Beginning of the memory area to search in |
| [in] | data_end | End of the memory area to search in |
| MOBI_RET mobi_search_links_kf8 | ( | MOBIResult * | result, |
| const unsigned char * | data_start, | ||
| const unsigned char * | data_end, | ||
| const MOBIFiletype | type | ||
| ) |
Find first occurrence of attribute part to be replaced in KF8 html/css.
It searches for "kindle:" value in attributes
| [in,out] | result | MOBIResult structure will be filled with found data |
| [in] | data_start | Beginning of the memory area to search in |
| [in] | data_end | End of the memory area to search in |
| [in] | type | Type of data (T_HTML or T_CSS) |