arXiv2bib does for still-unpublished (or never-published) papers from the arXiv what doi2bib does for published papers, i.e., it returns a BibTeX entry ready to be put in the bib collection (which it does itself when given the option -w).
For example:
laussy@azag:~$ ./arXiv2bib 2402.01627 -w
@Article{arXiv_casalengua24a,
author = {E. {Zubizarreta Casalengua} and F. P. Laussy},
title = {Spatial correlations of vortex quantum states},
journal = {arXiv:2402.01627},
year = {2024},
pdf = {arXiv/arXiv_casalengua24a},
url = {doi:10.48550/arXiv.2402.01627}
}
====
arXiv:2402.01627
arXiv_casalengua24a
arXiv_casalengua24a added at the top of /home/laussy/bib/arXiv.bib
This can then be exported for laussy.org with bib2wiki:
laussy@azag:~$ bib2wiki arXiv_casalengua24a
arXiv_casalengua24a
<wz tip="arXiv:2402.01627">[[File:arXXXiv.png|14px|link=]]</wz><u>[[arXiv_casalengua24a|Spatial correlations of vortex quantum states]]</u>. [[E. Zubizarreta Casalengua]] and [[F. P. Laussy]] in {{arXiv|2402.01627}} ([[2024]]).
which outputs:
Spatial correlations of vortex quantum states. E. Zubizarreta Casalengua and F. P. Laussy in arXiv:2402.01627 (2024).
Note that in the example above, as per my naming conventions, it should really be arXiv_zubizarretacasalengua24a but I'll leave it at that for now.
There is not yet a way to check for already-existing keys, as doi2bib does, so as to increment the last letter.
Insertion in the file is also not sorted.
I worked out the first version from doi2bib with the help of both Grok and ChatGPT, and came up with the first archivable version as v°0.8.8:
#!/usr/bin/perl -X -s
# __ ___ ____ _ _ _
# __ _ _ _\ \/ (_)_ _|___ \| |__ (_) |__
# / _` | '__\ /| \ \ / / __) | '_ \| | '_ \
#| (_| | | / \| |\ V / / __/| |_) | | |_) |
# \__,_|_| /_/\_\_| \_/ |_____|_.__/|_|_.__/
# F.P. Laussy - fabrice.laussy@gmail.com
# Adapted from doi2bib by Grok - Sun Sep 21 2025
# v0.8.8 Sun Sep 21 2025
#
# NOTE(review): -X disables all warnings globally (the `use warnings`
# below re-enables them lexically), and -s turns on perl's own rudimentary
# switch parsing: invoked as "arXiv2bib -w <id>", perl itself would swallow
# the leading -w into $main::w before the argument loop ever sees it.
# Confirm the intended invocation order is "arXiv2bib <id> -w" (as in the
# transcript above), or drop -s from the shebang.
use strict;
use warnings;
my $write_flag = 0;
my $id = '';
# Parse arguments: -w asks for the entry to be written into the bib file;
# any other argument that looks like an arXiv identifier -- old style
# "hep-th/9901001" or new style "2402.01627", optionally prefixed with
# "arXiv:" and/or suffixed with a version such as "v2" -- is taken as the
# identifier (the last matching argument wins).
foreach my $arg (@ARGV) {
    if ($arg eq '-w') {
        $write_flag = 1;
    } elsif ($arg =~ /\A(?:arXiv:)?((?:[\w\-]+\/)?\d+(?:\.\d+)?)(?:v\d+)?\z/i) {
        # Anchored with \A...\z: the original pattern was unanchored, so an
        # unrelated argument containing digits (e.g. a stray filename like
        # "notes2.txt") would partially match and silently become the id.
        $id = $1;
    }
}
if ($id eq '') {
    print STDERR "Please provide me with an arXiv identifier, e.g.,\n";
    print STDERR "arXiv2bib 2409.12952\n";
    exit(-1);
}
# path to important substitution files
# NOTE(review): $iso4 is never referenced anywhere below -- presumably
# kept for parity with doi2bib; confirm before relying on or deleting it.
my $iso4 = "/home/laussy/bib/doi2bib/iso4";
my $bibnames = "/home/laussy/bib/doi2bib/bibnames";
my $bibfile = "/home/laussy/bib/arXiv.bib";
# make a temporary directory to store files
# `uniqname` is an external helper assumed to print a unique,
# whitespace-free token -- TODO confirm it is on PATH wherever this runs.
my $dest = "arXiv2bib-".`uniqname`;
chomp $dest;
mkdir $dest or die "Cannot create directory $dest: $!";
chdir $dest or die "Cannot change to directory $dest: $!";
# set initial DOI for arXiv
# DataCite-style arXiv DOI (10.48550/arXiv.<id>), used as the first guess
# for the Crossref lookup below.
my $doi = "10.48550/arXiv.$id";
# download the bibliographic data for arXiv DOI using Crossref REST API
# The JSON body is saved to from_doi.json; -w "%{http_code}" makes curl
# print the HTTP status code, which is what the backticks capture.
my $curl_cmd = "curl -s -L -H \"Accept: application/json\" -H \"User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36\" \"https://api.crossref.org/works/$doi\" -o from_doi.json -w \"%{http_code}\"";
my $http_status = `$curl_cmd 2>&1`;
chomp $http_status;
# A failed Crossref lookup is deliberately ignored at this point: for
# unpublished papers the HTML fallback further down takes over.
if ($http_status != 200 || ! -f "from_doi.json" || -s "from_doi.json" == 0) {
# Silently skip printing the error, as fallback will handle it
}
# always fetch the arXiv abstract HTML for additional info
# (published DOI, journal reference, and the metadata fallback all come
# from this page)
my $html_cmd = "curl -s -A \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36\" \"https://arxiv.org/abs/$id\" -o article.html 2>/dev/null";
`$html_cmd`;
if (! -f "article.html" || -s "article.html" == 0) {
print STDERR "Error: Failed to download arXiv abstract page for $id\n";
chdir "..";
system("rm -rf $dest");
exit(-1);
}
# extract published DOI if present
# Once a paper is published, its abstract page advertises the journal DOI
# in a <meta name="citation_doi"> tag; prefer that over the arXiv DOI.
my $published_doi = `grep -oP '(?<=name="citation_doi" content=")[^"]+' article.html 2>/dev/null`;
chomp $published_doi;
if ($published_doi ne "") {
    # Retry Crossref with the journal DOI so the entry carries the
    # published metadata rather than the preprint's.
    # (The original kept a misspelled, write-only flag "$reretched" here;
    # it was never read anywhere, so it has been removed as dead code.)
    $doi = $published_doi;
    $curl_cmd = "curl -s -L -H \"Accept: application/json\" -H \"User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36\" \"https://api.crossref.org/works/$doi\" -o from_doi.json -w \"%{http_code}\"";
    $http_status = `$curl_cmd 2>&1`;
    chomp $http_status;
    if ($http_status == 200 && -s "from_doi.json" > 0) {
        print STDERR "Found published DOI $published_doi, using published metadata\n" if $write_flag;
    } else {
        print STDERR "Warning: Failed to fetch JSON for published DOI $published_doi, falling back to arXiv metadata\n" if $write_flag;
    }
}
# extract journal reference if present
my $journalref = `grep -oP '(?<=Journal reference: ).*?(?=<)' article.html 2>/dev/null`;
chomp $journalref;
# generate bib key base
# $redirect controls whether stderr of the extraction pipelines is shown
# (-w given) or discarded.
# NOTE(review): the pipelines below end in "2>/dev/null $redirect"; with
# -w this expands to "2>/dev/null 2>&1", where the later redirection wins,
# so a tool's stderr would be merged into the *captured* value -- confirm
# this is intended.
my $redirect = $write_flag ? "2>&1" : "2>/dev/null";
my $json_failed = 1;
my $bibauthors;
my $bibtitle;
my $bibyear;
my $bibdoi = "10.48550/arXiv.$id"; # Initialize bibdoi
my $bibkey;
if ($http_status == 200 && -f "from_doi.json" && -s "from_doi.json" > 0) {
# Check if JSON is valid
my $json_check = qx{jq -e . from_doi.json >/dev/null 2>&1; echo \$?};
chomp $json_check;
if ($json_check == 0) {
# JSON-based extraction (now using .message paths)
# Authors: abbreviate each given name to initials (hyphen-aware, so
# "Jean-Pierre" becomes "J.-P."), append the family name, join all
# authors with " and ", then apply the bibnames sed substitutions
# (e.g. the {Zubizarreta Casalengua} bracing seen in the example).
$bibauthors = qx{<from_doi.json jq -r '.message.author | map((.given | split("-") | map(.[0:1] + ".") | join("-")) + " " + .family) | join(" and ")' | sed -f $bibnames 2>/dev/null $redirect};
chomp $bibauthors;
if ($write_flag) {
print STDERR "Raw author output from jq: $bibauthors\n";
}
if ($bibauthors eq "" || $bibauthors eq "null") {
print STDERR "Error: Failed to extract authors for arXiv $id\n" if $write_flag;
$bibauthors = "null";
}
# Title: Crossref stores titles as an array; take the first element.
$bibtitle = qx{<from_doi.json jq -r '.message.title[0]' 2>/dev/null $redirect};
chomp $bibtitle;
if ($bibtitle eq "" || $bibtitle eq "null") {
print STDERR "Error: Failed to extract title for arXiv $id\n" if $write_flag;
$bibtitle = "null";
}
# Year: first component of the first "date-parts" entry of the
# publication date.
$bibyear = qx{<from_doi.json jq -r '.message.published."date-parts"[0][0]' 2>/dev/null $redirect};
chomp $bibyear;
if ($bibyear eq "" || $bibyear eq "null") {
print STDERR "Error: Failed to extract year for arXiv $id\n" if $write_flag;
$bibyear = "0000";
}
# DOI as recorded by Crossref; fall back to the DataCite arXiv DOI.
$bibdoi = qx{<from_doi.json jq -r '.message.DOI' 2>/dev/null $redirect};
chomp $bibdoi;
if ($bibdoi eq "" || $bibdoi eq "null") {
$bibdoi = "10.48550/arXiv.$id";
}
# Extract first author's last name from formatted bibauthors
# The last whitespace-separated token of the first author is taken as
# the surname; a {braced compound surname} counts as a single token.
my @authors = split / and /, $bibauthors;
$bibkey = "unknown";
if (@authors && $bibauthors ne "null") {
my $first_author = $authors[0];
if ($first_author =~ /.*?\s+([^{}\s]+|\{[^}]+\})$/) {
$bibkey = $1;
$bibkey =~ s/[{}]//g; # Remove braces for compound names
}
}
# Any missing critical field sends us to the HTML fallback below.
$json_failed = ($bibauthors eq "null" || $bibtitle eq "null" || $bibkey eq "unknown");
} else {
print STDERR "Error: Invalid JSON for arXiv $id\n" if $write_flag;
}
}
# HTML-based extraction as fallback
# Used when the Crossref JSON was missing, invalid, or incomplete: scrape
# the <meta name="citation_*"> tags of the arXiv abstract page instead.
if ($json_failed) {
# Extract authors
# citation_author tags come as "Family, Given" -- one per matched line.
my @authors = split /\n/, `grep -oP '(?<=name="citation_author" content=")[^"]+' article.html 2>/dev/null`;
my @formatted_authors;
my $first_family = "";
foreach my $author (@authors) {
chomp $author;
if ($author =~ /^([^,]+),\s*(.+)$/) {
my ($surname, $given) = ($1, $2);
# NOTE(review): only u-umlaut and o-acute get TeX-escaped here; any
# other accented surname character passes through untouched.
$surname =~ s/ü/\\"u/g;
$surname =~ s/ó/\\'o/g;
if ($first_family eq "" && $surname ne "") {
$first_family = $surname;
}
# Reduce the given names to initials, and brace multi-word surnames
# so bibTeX treats them as a single name component.
my @given_parts = split /\s+/, $given;
my @initials = map { substr($_, 0, 1) . "." } @given_parts;
my $formatted = join(" ", @initials) . " " . ($surname =~ /\s/ ? "{$surname}" : $surname);
push @formatted_authors, $formatted;
}
}
$bibauthors = join(" and ", @formatted_authors);
if ($bibauthors eq "") {
print STDERR "Error: Failed to extract authors from HTML for arXiv $id\n" if $write_flag;
$bibauthors = "null";
} else {
# Apply the same bibnames substitutions as on the JSON path.
# NOTE(review): the author string is interpolated into a shell
# double-quoted echo -- a name containing ", `, or $ would break or be
# expanded by the shell; consider doing the substitutions in Perl.
if (-f $bibnames) {
$bibauthors = `echo "$bibauthors" | sed -f $bibnames 2>/dev/null`;
chomp $bibauthors;
}
}
# Extract title
$bibtitle = `grep -oP '(?<=<meta name="citation_title" content=")[^"]+' article.html 2>/dev/null`;
chomp $bibtitle;
if ($bibtitle eq "") {
# Fallback: scrape the visible <h1> title of the abstract page.
$bibtitle = `grep -oP '(?<=<h1 class="title mathjax">\\s*<span class="descriptor">Title:</span>\\s*)[^<]+' article.html 2>/dev/null`;
chomp $bibtitle;
}
if ($bibtitle eq "") {
print STDERR "Error: Failed to extract title from HTML for arXiv $id\n" if $write_flag;
$bibtitle = "null";
}
# Extract year
$bibyear = `grep -oP '(?<=<meta name="citation_date" content=")[0-9]{4}' article.html 2>/dev/null`;
chomp $bibyear;
if ($bibyear eq "") {
# Fallback: last 4-digit group found on the submission-history line.
$bibyear = `grep -oP '(?<=<div class="submission-history">.*Submitted.*?)\\d{4}' article.html 2>/dev/null`;
chomp $bibyear;
$bibyear =~ s/.*(\d{4})$/$1/;
}
if ($bibyear eq "") {
print STDERR "Error: Failed to extract year from HTML for arXiv $id\n" if $write_flag;
$bibyear = "0000";
}
# Key base: the first author's family name as scraped above.
$bibkey = $first_family || "unknown";
}
# Exit if critical fields are missing
# Neither the Crossref JSON nor the arXiv HTML produced usable metadata:
# clean up the temporary directory and bail out.
if ($bibauthors eq "null" || $bibtitle eq "null" || $bibyear eq "0000" || $bibkey eq "unknown") {
    print STDERR "Error: Failed to extract valid author, title, or year for arXiv $id\n";
    chdir "..";
    system("rm -rf $dest");
    exit(-1);
}
# Clean bibkey for key format: transliterate accented letters to plain
# ASCII *first*, then strip whatever is left that is not a letter.
# (The original ran s/[^a-zA-Z]//g before the transliterations, which
# deleted the accented characters outright and made every accent rule
# below dead code.)
# NOTE(review): without `use utf8`, these character classes match the
# individual bytes of a UTF-8 character, so e.g. "é" transliterates to a
# doubled "ee" rather than a single "e" -- confirm whether the input
# should be decoded as UTF-8 first.
$bibkey =~ s/[áàâäãå]/a/g; # Replace accented a
$bibkey =~ s/[éèêë]/e/g;   # Replace accented e
$bibkey =~ s/[íìîï]/i/g;   # Replace accented i
$bibkey =~ s/[óòôöõ]/o/g;  # Replace accented o
$bibkey =~ s/[úùûü]/u/g;   # Replace accented u
$bibkey =~ s/[ç]/c/g;      # Replace ç
$bibkey =~ s/[ñ]/n/g;      # Replace ñ
$bibkey =~ s/[^a-zA-Z]//g; # Strip any remaining non-letter
$bibkey = lc $bibkey;
my $name = $bibkey;
my $bibyear_short = substr($bibyear, -2); # Use last two digits of year
my $basekey = "arXiv_" . $name;
# determine next letter
# Scan the existing bib file (if any) to (a) detect whether this DOI is
# already recorded and (b) collect the key suffix letters already taken
# for the same author/year, so that a fresh one can be picked.
my %letters;
my $content = '';
my $existing_key = '';
my $exists = 0;
if (-f $bibfile) {
    open(my $fh, '<', $bibfile) or die "Cannot open $bibfile: $!";
    { local $/; $content = <$fh>; }    # slurp the whole file
    close $fh;
    # Walk every @Type{key, fields} entry in the file.
    while ($content =~ /\@(\w+)\{([^,]+),\s*(.*?)\n\s*\}/gxis) {
        my $type = $1;
        my $key = $2;
        my $fields = $3;
        # Duplicate check: an entry whose url field carries this very DOI.
        if ($type !~ /string/i && ($fields =~ /url\s*=\s*\{doi:\Q$bibdoi\E\}/i)) {
            $existing_key = $key;
            $exists = 1;
            last;
        }
        # Same first author: remember the suffix letter when the two-digit
        # year also matches.
        if ($type !~ /string/i && $key =~ /\QarXiv_$name\E(\d{2})([a-z])/i) {
            my $captured_year = $1;
            my $captured_letter = $2;
            if ($captured_year eq $bibyear_short) {
                $letters{lc $captured_letter} = 1;
            }
        }
    }
}
# check for existing entry
if ($exists) {
    print STDERR "Entry for arXiv $id already exists with key $existing_key\n";
    chdir "..";
    system("rm -rf $dest");
    exit(0);
}
# Pick the first free suffix letter. Perl's magic string increment wraps
# 'z' to 'aa', so exhausting the alphabet still yields a valid key (the
# original chr(ord($letter)+1) loop would have produced '{' after 'z').
my $letter = 'a';
$letter++ while $letters{$letter};
my $bibkey_final = "arXiv_" . $name . $bibyear_short . $letter;
# construct entry
# Assemble the @Article fields in order; the note field only appears when
# a journal reference was scraped from the abstract page.
my @field_lines = (
    " author = {$bibauthors},",
    " title = {$bibtitle},",
    " journal = {arXiv:$id},",
    " year = {$bibyear},",
);
push @field_lines, " note = {$journalref}," if $journalref ne "";
push @field_lines, " pdf = {arXiv/$bibkey_final},";
my $entry = join("\n",
    "\@Article{$bibkey_final,",
    @field_lines,
    " url = {doi:$bibdoi}\n}",
);
# The entry itself goes to stdout...
print "$entry\n";
# ...while progress information goes to stderr.
print STDERR "\n====\n\n";
print STDERR "arXiv:$id\n";
print STDERR "$bibkey_final\n";
if ($write_flag) {
    # Prepend the new entry to the previously slurped bib file content.
    open(my $out, '>', $bibfile) or die "Cannot write to $bibfile: $!";
    print $out $entry . "\n\n" . $content;
    close $out;
    print STDERR "$bibkey_final added at the top of $bibfile\n";
}
# back to initial directory and remove temporary one
chdir "..";
system("rm -rf $dest");