Tag Archives: regex

Original Post

So I wanted to build a link parser like the one on Facebook, but didn’t find one that suited me. So I built one. My code is based off the code found here, but I rewrote much of it to be cleaner and to return JSON rather than HTML.

Code Change – Feb 2nd, 2011
Added some refinements to the cleaning mechanism and greatly speed up image parser.

HTML

<script src="http://ajax.googleapis.com/ajax/libs/jquery/1.4.4/jquery.min.js"></script>
<style>
	#atc_bar{width:500px;}
	#attach_content{border:1px solid #ccc;padding:10px;margin-top:10px;}
	#atc_images {width:100px;height:120px;overflow:hidden;float:left;}
	#atc_info {width:350px;float:left;height:100px;text-align:left; padding:10px;}
	#atc_title {font-size:14px;display:block;}
	#atc_url {font-size:10px;display:block;}
	#atc_desc {font-size:12px;}
	#atc_total_image_nav{float:left;padding-left:20px}
	#atc_total_images_info{float:left;padding:4px 10px;font-size:12px;}
</style>
<br /><br /><br /><br />

<div align="center">
	<h1>Parse a Link Like Facebook with PHP and Jquery</h1>
	<div id="atc_bar" align="center">
		Paste Link Here: <input type="text" name="url" size="40" id="url" value="" />
		<input type="button" name="attach" value="Parse" id="attach" />
		<input type="hidden" name="cur_image" id="cur_image" />
		<div id="loader">

			<div align="center" id="atc_loading" style="display:none"><img src="load.gif" alt="Loading" /></div>
			<div id="attach_content" style="display:none">
				<div id="atc_images"></div>
				<div id="atc_info">

					<label id="atc_title"></label>
					<label id="atc_url"></label>
					<br clear="all" />
					<label id="atc_desc"></label>
					<br clear="all" />
				</div>
				<div id="atc_total_image_nav" >
					<a href="#" id="prev"><img src="prev.png"  alt="Prev" border="0" /></a><a href="#" id="next"><img src="next.png" alt="Next" border="0" /></a>
				</div>

				<div id="atc_total_images_info" >
					Showing <span id="cur_image_num">1</span> of <span id="atc_total_images">1</span> images
				</div>
				<br clear="all" />
			</div>
		</div>
		<br clear="all" />
	</div>
</div>

JavaScript


<script>

	$(document).ready(function(){

		// delete event
		$('#attach').bind("click", parse_link);

		function parse_link ()
		{
			if(!isValidURL($('#url').val()))
			{
				alert('Please enter a valid url.');
				return false;
			}
			else
			{
				$('#atc_loading').show();
				$('#atc_url').html($('#url').val());
				$.post("fetch.php?url="+escape($('#url').val()), {}, function(response){

					//Set Content
					$('#atc_title').html(response.title);
					$('#atc_desc').html(response.description);
					$('#atc_price').html(response.price);

					$('#atc_total_images').html(response.total_images);

					$('#atc_images').html(' ');
					$.each(response.images, function (a, b)
					{
						$('#atc_images').append('<img src="'+b.img+'" width="100" id="'+(a+1)+'">');
					});
					$('#atc_images img').hide();

					//Flip Viewable Content
					$('#attach_content').fadeIn('slow');
					$('#atc_loading').hide();

					//Show first image
					$('img#1').fadeIn();
					$('#cur_image').val(1);
					$('#cur_image_num').html(1);

					// next image
					$('#next').unbind('click');
					$('#next').bind("click", function(){

						var total_images = parseInt($('#atc_total_images').html());
						if (total_images > 0)
						{
							var index = $('#cur_image').val();
							$('img#'+index).hide();
							if(index < total_images)
							{
								new_index = parseInt(index)+parseInt(1);
							}
							else
							{
								new_index = 1;
							}

							$('#cur_image').val(new_index);
							$('#cur_image_num').html(new_index);
							$('img#'+new_index).show();
						}
					});

					// prev image
					$('#prev').unbind('click');
					$('#prev').bind("click", function(){

						var total_images = parseInt($('#atc_total_images').html());
						if (total_images > 0)
						{
							var index = $('#cur_image').val();
							$('img#'+index).hide();
							if(index > 1)
							{
								new_index = parseInt(index)-parseInt(1);;
							}
							else
							{
								new_index = total_images;
							}

							$('#cur_image').val(new_index);
							$('#cur_image_num').html(new_index);
							$('img#'+new_index).show();
					 	}
					});
				});
			}
		};
	});

	function isValidURL(url)
	{
		var RegExp = /(ftp|http|https):\/\/(\w+:{0,1}\w*@)?(\S+)(:[0-9]+)?(\/|\/([\w#!:.?+=&%@!\-\/]))?/;

		if(RegExp.test(url)){
			return true;
		}else{
			return false;
		}
	}
</script>

PHP

$url = urldecode($_REQUEST['url']);
$url = checkValues($url);
$return_array = array();

$base_url = substr($url,0, strpos($url, "/",8));
$relative_url = substr($url,0, strrpos($url, "/")+1);

// Get Data
$cc = new cURL();
$string = $cc->get($url);
$string = str_replace(array("\n","\r","\t",'</span>','</div>'), '', $string);

$string = preg_replace('/(<(div|span)\s[^>]+\s?>)/',  '', $string);
if (mb_detect_encoding($string, "UTF-8") != "UTF-8")
	$string = utf8_encode($string);

// Parse Title
$nodes = extract_tags( $string, 'title' );
$return_array['title'] = trim($nodes[0]['contents']);

// Parse Base
$base_override = false;
$base_regex = '/<base[^>]*'.'href=[\"|\'](.*)[\"|\']/Ui';
preg_match_all($base_regex, $string, $base_match, PREG_PATTERN_ORDER);
if(strlen($base_match[1][0]) > 0)
{
	$base_url = $base_match[1][0];
	$base_override = true;
}

// Parse Description
$return_array['description'] = '';
$nodes = extract_tags( $string, 'meta' );
foreach($nodes as $node)
{
	if (strtolower($node['attributes']['name']) == 'description')
		$return_array['description'] = trim($node['attributes']['content']);
}

// Parse Images
$images_array = extract_tags( $string, 'img' );
$images = array();
for ($i=0;$i<=sizeof($images_array);$i++)
{
	$img = trim(@$images_array[$i]['attributes']['src']);
	$width = preg_replace("/[^0-9.]/", '', $images_array[$i]['attributes']['width']);
	$height = preg_replace("/[^0-9.]/", '', $images_array[$i]['attributes']['height']);

	$ext = trim(pathinfo($img, PATHINFO_EXTENSION));

	if($img && $ext != 'gif')
	{
		if (substr($img,0,7) == 'http://')
			;
		else	if (substr($img,0,1) == '/' || $base_override)
			$img = $base_url . $img;
		else
			$img = $relative_url . $img;

		if ($width == '' && $height == '')
		{
			$details = @getimagesize($img);

			if(is_array($details))
			{
				list($width, $height, $type, $attr) = $details;
			}
		}
		$width = intval($width);
		$height = intval($height);

		if ($width > 199 || $height > 199 )
		{
			if (
				(($width > 0 && $height > 0 && (($width / $height) < 3) && (($width / $height) > .2))
					|| ($width > 0 && $height == 0 && $width < 700)
					|| ($width == 0 && $height > 0 && $height < 700)
				)
				&& strpos($img, 'logo') === false )
			{
				$images[] = array("img" => $img, "width" => $width, "height" => $height, 'area' =>  ($width * $height),'offset' => $images_array[$i]['offset']);
			}
		}

	}
}
$return_array['images'] = array_values(($images));
$return_array['total_images'] = count($return_array['images']);

header('Cache-Control: no-cache, must-revalidate');
header('Expires: Mon, 26 Jul 1997 05:00:00 GMT');
header('Content-type: application/json');

echo json_encode($return_array);
exit;